RWKV · saharNooby · Jun 13, 2023 · Jun 14, 2023 · Jun 14, 2023 · Jun 14, 2023
diff --git a/README.md b/README.md
@@ -130,10 +130,10 @@ This option would require a little more manual work, but you can use it with any
 
 ```commandline
 # Windows
-python rwkv\convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float16
+python rwkv\convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin FP16
 
 # Linux / MacOS
-python rwkv/convert_pytorch_to_ggml.py ~/Downloads/RWKV-4-Pile-169M-20220807-8023.pth ~/Downloads/rwkv.cpp-169M.bin float16
+python rwkv/convert_pytorch_to_ggml.py ~/Downloads/RWKV-4-Pile-169M-20220807-8023.pth ~/Downloads/rwkv.cpp-169M.bin FP16
 ```
 
 **Optionally**, quantize the model into one of the quantized formats from the table above:

diff --git a/rwkv.cpp b/rwkv.cpp
diff --git a/rwkv.h b/rwkv.h
@@ -83,10 +83,53 @@ extern "C" {
     // - ctx: the context to retrieve the error for, or NULL for the global error.
     RWKV_API enum rwkv_error_flags rwkv_get_last_error(struct rwkv_context * ctx);
 
+    enum rwkv_init_from_file_option_key {
+        // Sets target format of model parameters.
+        //
+        // If an FP16 or FP32 model is being loaded, and this option is set,
+        // parameters will be quantized just-in-time into the specified format.
+        // If an already quantized model is being loaded, the value of this option is ignored.
+        // The function will not read the whole model file at once, but will do quantization tensor-by-tensor;
+        // it is therefore safe to load big models, as long as they fit into RAM when quantized.
+        // Use of this option will introduce a significant one-time delay when loading the model.
+        //
+        // The intended use case is to keep only the FP16 model on disk, instead of wasting
+        // disk space on copies of the model in every available quantized format.
+        //
+        // Allowed values:
+        // - Q4_0
+        // - Q4_1
+        // - Q5_0
+        // - Q5_1
+        // - Q8_0
+        RWKV_INIT_FROM_FILE_OPTION_TARGET_FORMAT_NAME,
+        // Count of option keys; must stay the last enumerator. Do not use this as an actual option key.
+        RWKV_INIT_FROM_FILE_OPTION_COUNT
+    };
+
+    struct rwkv_init_from_file_option {
+        // Key of the option.
+        enum rwkv_init_from_file_option_key key;
+        // Value of the option as a NUL-terminated, UTF-8 encoded string.
+        // Pointer-to-const, so that string literals can be assigned without a cast (required for C++ callers).
+        const char * value;
+    };
+
     // Loads the model from a file and prepares it for inference.
+    // Loading behavior can be customized with options, but none of them are required.
+    // Behavior of the function is undefined when multiple options with the same key are specified.
     // Returns NULL on any error.
     // - model_file_path: path to model file in ggml format.
     // - n_threads: count of threads to use, must be positive.
+    // - options: array of options, see rwkv_init_from_file_option_key for available keys. Passing NULL is the same as setting option_count to 0.
+    // - option_count: count of elements in the options array.
+    RWKV_API struct rwkv_context * rwkv_init_from_file_ex(
+        const char * model_file_path,
+        const uint32_t n_threads,
+        const struct rwkv_init_from_file_option * options,
+        const size_t option_count
+    );
+
+    // Same as rwkv_init_from_file_ex, but passing an empty array of options.
     RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
 
     // Creates a new context from an existing one.

diff --git a/rwkv/convert_pytorch_to_ggml.py b/rwkv/convert_pytorch_to_ggml.py
@@ -1,5 +1,5 @@
 # Converts an RWKV model checkpoint in PyTorch format to an rwkv.cpp compatible file.
-# Usage: python convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float32
+# Usage: python convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin FP16
 # Get model checkpoints from https://huggingface.co/BlinkDL
 # See FILE_FORMAT.md for the documentation on the file format.
 
@@ -12,7 +12,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Convert an RWKV model checkpoint in PyTorch format to an rwkv.cpp compatible file')
     parser.add_argument('src_path', help='Path to PyTorch checkpoint file')
     parser.add_argument('dest_path', help='Path to rwkv.cpp checkpoint file, will be overwritten')
-    parser.add_argument('data_type', help='Data type, float16 or float32', type=str, choices=['float16', 'float32'], default='float32')
+    # nargs='?' makes this positional optional; without it argparse always requires the argument and never uses the default.
+    parser.add_argument('data_type', help='Data type, FP16 or FP32', type=str, choices=['FP16', 'FP32'], default='FP16', nargs='?')
     return parser.parse_args()
 
 def get_layer_count(state_dict: Dict[str, torch.Tensor]) -> int:
@@ -26,6 +26,8 @@ def get_layer_count(state_dict: Dict[str, torch.Tensor]) -> int:
     return n_layer
 
 def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_type: str) -> None:
+    is_FP16 = data_type == 'FP16' or data_type == 'float16'
+
     emb_weight: torch.Tensor = state_dict['emb.weight']
 
     n_layer = get_layer_count(state_dict)
@@ -42,7 +44,7 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
             n_vocab,
             n_embed,
             n_layer,
-            1 if data_type == 'float16' else 0
+            1 if is_FP16 else 0
         ))
 
         for k in state_dict.keys():
@@ -56,8 +58,8 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
             if '.time_decay' in k:
                 tensor = -torch.exp(tensor)
 
-            # Keep 1-dim vectors in fp32
-            if data_type == 'float16' and len(tensor.shape) > 1:
+            # Keep 1-dim vectors in FP32
+            if is_FP16 and len(tensor.shape) > 1:
                 tensor = tensor.half()
 
             shape = tensor.shape

diff --git a/rwkv/convert_pytorch_to_ggml.test.py b/rwkv/convert_pytorch_to_ggml.test.py
@@ -13,7 +13,7 @@ def test() -> None:
             'blocks.0.ln1.weight': torch.tensor([1], dtype=torch.float32)
         }
 
-        convert_pytorch_to_ggml.write_state_dict(state_dict, dest_path=test_file_path, data_type='float32')
+        convert_pytorch_to_ggml.write_state_dict(state_dict, dest_path=test_file_path, data_type='FP32')
 
         with open(test_file_path, 'rb') as input:
             actual_bytes: bytes = input.read()

diff --git a/rwkv/rwkv_cpp_model.py b/rwkv/rwkv_cpp_model.py
@@ -2,7 +2,7 @@
 import torch
 import multiprocessing
 import rwkv_cpp_shared_library
-from typing import Tuple, Optional
+from typing import Dict, Tuple, Optional
 
 class RWKVModel:
     """
@@ -15,6 +15,7 @@ def __init__(
             model_path: str,
             thread_count: int = max(1, multiprocessing.cpu_count() // 2),
             gpu_layers_count: int = 0,
+            options: Optional[Dict[rwkv_cpp_shared_library.RWKVInitFromFileOptionKey, str]] = None
     ):
         """
         Loads the model and prepares it for inference.
@@ -28,6 +29,8 @@ def __init__(
             Path to RWKV model file in ggml format.
         thread_count : int
             Thread count to use. If not set, defaults to CPU count / 2.
+        options : Optional[Dict[RWKVInitFromFileOptionKey, str]]
+            Options passed to rwkv_init_from_file_ex.
         """
 
         assert os.path.isfile(model_path), f'{model_path} is not a file'
@@ -36,7 +39,7 @@ def __init__(
 
         self._library = shared_library
 
-        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count)
+        self._ctx = self._library.rwkv_init_from_file(model_path, thread_count, options)
 
         if gpu_layers_count > 0:
             self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count)

diff --git a/rwkv/rwkv_cpp_shared_library.py b/rwkv/rwkv_cpp_shared_library.py
@@ -2,7 +2,8 @@
 import sys
 import ctypes
 import pathlib
-from typing import Optional
+import enum
+from typing import Dict, Optional
 
 QUANTIZED_FORMAT_NAMES = (
     'Q4_0',
@@ -14,6 +15,29 @@
 
 P_FLOAT = ctypes.POINTER(ctypes.c_float)
 
+class RWKVInitFromFileOptionKey(enum.Enum):
+    # Sets target format of model parameters.
+    #
+    # If an FP16 or FP32 model is being loaded, and this option is set,
+    # parameters will be quantized just-in-time into the specified format.
+    # If an already quantized model is being loaded, the value of this option is ignored.
+    # The function will not read the whole model file at once, but will do quantization tensor-by-tensor;
+    # it is therefore safe to load big models, as long as they fit into RAM when quantized.
+    # Use of this option will introduce a significant one-time delay when loading the model.
+    #
+    # The intended use case is to keep only the FP16 model on disk, instead of wasting
+    # disk space on copies of the model in every available quantized format.
+    #
+    # For allowed values, see QUANTIZED_FORMAT_NAMES.
+    # The numeric value must match RWKV_INIT_FROM_FILE_OPTION_TARGET_FORMAT_NAME in rwkv.h.
+    TARGET_FORMAT_NAME = 0
+
+class RWKVInitFromFileOption(ctypes.Structure):
+    # Mirrors struct rwkv_init_from_file_option declared in rwkv.h;
+    # field order and types here must stay in sync with the C declaration.
+
+    _fields_ = [
+        # Numeric value of an RWKVInitFromFileOptionKey member, passed as a C int.
+        ('key', ctypes.c_int),
+        # NUL-terminated byte string; rwkv_init_from_file encodes the Python string as UTF-8 before assigning it.
+        ('value', ctypes.c_char_p)
+    ]
+
 class RWKVContext:
 
     def __init__(self, ptr: ctypes.pointer):
@@ -37,8 +61,8 @@ def __init__(self, shared_library_path: str):
 
         self.library = ctypes.cdll.LoadLibrary(shared_library_path)
 
-        self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
-        self.library.rwkv_init_from_file.restype = ctypes.c_void_p
+        self.library.rwkv_init_from_file_ex.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.POINTER(RWKVInitFromFileOption), ctypes.c_size_t]
+        self.library.rwkv_init_from_file_ex.restype = ctypes.c_void_p
 
         self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
         self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
@@ -70,9 +94,10 @@ def __init__(self, shared_library_path: str):
         self.library.rwkv_get_system_info_string.argtypes = []
         self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p
 
-    def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVContext:
+    def rwkv_init_from_file(self, model_file_path: str, thread_count: int, options: Optional[Dict[RWKVInitFromFileOptionKey, str]] = None) -> RWKVContext:
         """
         Loads the model from a file and prepares it for inference.
+        Loading behavior can be customized with options, but none of them are required.
         Throws an exception in case of any error. Error messages would be printed to stderr.
 
         Parameters
@@ -81,9 +106,25 @@ def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVCo
             Path to model file in ggml format.
         thread_count : int
             Count of threads to use, must be positive.
+        options : Optional[Dict[RWKVInitFromFileOptionKey, str]]
+            Options passed to rwkv_init_from_file_ex.
         """
 
-        ptr = self.library.rwkv_init_from_file(model_file_path.encode('utf-8'), ctypes.c_uint32(thread_count))
+        options_count = 0
+        options_ptr = None
+
+        if options is not None and len(options) > 0:
+            options_count = len(options)
+            options_ptr = (RWKVInitFromFileOption * options_count)()
+
+            i = 0
+            for k, v in options.items():
+                options_ptr[i].key = k.value
+                options_ptr[i].value = v.encode('utf-8')
+
+                i += 1
+
+        ptr = self.library.rwkv_init_from_file_ex(model_file_path.encode('utf-8'), ctypes.c_uint32(thread_count), options_ptr, options_count)
 
         assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
 

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -12,6 +12,7 @@ file(COPY tiny-rwkv-660K-FP32.bin DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY tiny-rwkv-660K-FP16.bin DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY expected_logits.bin DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 
-rwkv_add_test(test_ggml_basics.c)
-rwkv_add_test(test_tiny_rwkv.c)
-rwkv_add_test(test_context_cloning.c)
+# Pass every C source file found in this directory to rwkv_add_test.
+# NOTE(review): file(GLOB) is evaluated at configure time only, so CMake has to be re-run
+# after a new test file is added (alternatively, consider CONFIGURE_DEPENDS, CMake 3.12+).
+file(GLOB tests *.c)
+foreach (test ${tests})
+	rwkv_add_test(${test})
+endforeach()
diff --git a/tests/test_context_cloning.c b/tests/test_context_cloning.c
@@ -1,4 +1,6 @@
-#include <rwkv.h>
+// Tests that evaluation gives identical results after context cloning.
+
+#include "rwkv.h"
 
 #include <stdlib.h>
 #include <stdio.h>

diff --git a/tests/test_quantization_on_the_fly.c b/tests/test_quantization_on_the_fly.c
@@ -0,0 +1,91 @@
+// Tests that results from an on-the-fly quantized model are identical to results from a pre-quantized model.
+
+#include "ggml.h"
+#include "rwkv.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define N_THREADS 2
+
+// Evaluates the NUL-terminated prompt byte by byte, using each byte as a token.
+// The first token is evaluated with a NULL input state, exactly as this test always did;
+// for all following tokens, the state buffer is both input and output.
+// After a successful call, logits holds the output for the last token of the prompt.
+// Returns 1 on success, 0 as soon as any rwkv_eval call fails.
+static int eval_prompt(struct rwkv_context * ctx, const unsigned char * prompt, float * state, float * logits) {
+    if (!rwkv_eval(ctx, prompt[0], NULL, state, logits)) {
+        return 0;
+    }
+
+    for (size_t i = 1; prompt[i] != 0; i++) {
+        if (!rwkv_eval(ctx, prompt[i], state, state, logits)) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+// Quantizes the FP32 test model to Q5_1 ahead of time, loads the same FP32 model with
+// on-the-fly Q5_1 quantization, and checks that both contexts produce byte-identical logits.
+// Returns EXIT_SUCCESS when the logits are identical, EXIT_FAILURE on any error or mismatch.
+int main(void) {
+    int exit_code = EXIT_FAILURE;
+
+    // Everything that needs cleanup is declared up front, so that every error path
+    // can jump to the single cleanup label at the end of the function.
+    struct rwkv_context * prequantized_ctx = NULL;
+    struct rwkv_context * on_the_fly_quantized_ctx = NULL;
+    float * state = NULL;
+    float * expected_logits = NULL;
+    float * actual_logits = NULL;
+    size_t state_len = 0;
+    size_t logits_len = 0;
+
+    const unsigned char prompt[12] = "hello world";
+
+    struct rwkv_init_from_file_option option = {RWKV_INIT_FROM_FILE_OPTION_TARGET_FORMAT_NAME, "Q5_1"};
+
+    // Reference: quantize the model file ahead of time, then load the quantized file.
+    if (!rwkv_quantize_model_file("tiny-rwkv-660K-FP32.bin", "tiny-rwkv-660K-FP32-Q5_1.bin", "Q5_1")) {
+        fprintf(stderr, "Failed to quantize the model file\n");
+        goto cleanup;
+    }
+
+    prequantized_ctx = rwkv_init_from_file("tiny-rwkv-660K-FP32-Q5_1.bin", N_THREADS);
+
+    if (!prequantized_ctx) {
+        fprintf(stderr, "Unexpected error 0x%.8X\n", (unsigned int) rwkv_get_last_error(NULL));
+        goto cleanup;
+    }
+
+    // Subject under test: load the FP32 file and quantize it while loading.
+    on_the_fly_quantized_ctx = rwkv_init_from_file_ex("tiny-rwkv-660K-FP32.bin", N_THREADS, &option, 1);
+
+    if (!on_the_fly_quantized_ctx) {
+        fprintf(stderr, "Unexpected error 0x%.8X\n", (unsigned int) rwkv_get_last_error(NULL));
+        goto cleanup;
+    }
+
+    // The state buffer is shared between both contexts and the logits are compared byte-wise,
+    // so the buffer sizes of both contexts must agree before anything is allocated.
+    state_len = rwkv_get_state_len(prequantized_ctx);
+    logits_len = rwkv_get_logits_len(prequantized_ctx);
+
+    if (state_len != rwkv_get_state_len(on_the_fly_quantized_ctx) ||
+        logits_len != rwkv_get_logits_len(on_the_fly_quantized_ctx)) {
+        fprintf(stderr, "State or logits length differs between contexts\n");
+        goto cleanup;
+    }
+
+    state = calloc(state_len, sizeof(float));
+
+    if (!state) {
+        fprintf(stderr, "Failed to allocate state\n");
+        goto cleanup;
+    }
+
+    expected_logits = calloc(logits_len, sizeof(float));
+
+    if (!expected_logits) {
+        fprintf(stderr, "Failed to allocate logits\n");
+        goto cleanup;
+    }
+
+    actual_logits = calloc(logits_len, sizeof(float));
+
+    if (!actual_logits) {
+        fprintf(stderr, "Failed to allocate logits\n");
+        goto cleanup;
+    }
+
+    // A failed evaluation must fail the test: otherwise two failed runs would leave
+    // both zero-initialized logits buffers untouched and they would compare as identical.
+    if (!eval_prompt(prequantized_ctx, prompt, state, expected_logits)) {
+        fprintf(stderr, "Evaluation of the pre-quantized model failed\n");
+        goto cleanup;
+    }
+
+    if (!eval_prompt(on_the_fly_quantized_ctx, prompt, state, actual_logits)) {
+        fprintf(stderr, "Evaluation of the on-the-fly quantized model failed\n");
+        goto cleanup;
+    }
+
+    if (memcmp(expected_logits, actual_logits, logits_len * sizeof(float)) != 0) {
+        fprintf(stderr, "Results not identical :(\n");
+        goto cleanup;
+    }
+
+    fprintf(stdout, "Results identical, success!\n");
+    exit_code = EXIT_SUCCESS;
+
+cleanup:
+    if (on_the_fly_quantized_ctx) {
+        rwkv_free(on_the_fly_quantized_ctx);
+    }
+
+    if (prequantized_ctx) {
+        rwkv_free(prequantized_ctx);
+    }
+
+    free(expected_logits);
+    free(actual_logits);
+    free(state);
+
+    return exit_code;
+}