From 9cbb9d92315dd0687c0e981d3ec6e819cf3e7143 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 21 Jun 2023 21:06:21 +0500 Subject: [PATCH] Various improvements (#104) * Make rwkv_gpu_offload_layers return true only if layers were actually offloaded * Validate device of tensors * Offload all layers during test * Consistently use FP16 and FP32 instead of float16/fp16/F16/etc. * Use spaces for indentation * Remove spaces between type name and [] * Add cuBLAS on Windows guide, refactor docs structure * Insert replacement characters when decoding invalid UTF-8 sequences * Fix compatibility * Fix formatting * Fix copy-pasted tensor validation --- README.md | 44 +++++++-------- CODE_STYLE.md => docs/CODE_STYLE.md | 0 FILE_FORMAT.md => docs/FILE_FORMAT.md | 0 docs/cuBLAS_on_Windows.md | 68 +++++++++++++++++++++++ extras/CMakeLists.txt | 18 +++---- rwkv.cpp | 77 +++++++++++++-------------- rwkv.h | 7 +-- rwkv/convert_pytorch_to_ggml.py | 12 +++-- rwkv/convert_pytorch_to_ggml.test.py | 2 +- rwkv/rwkv_cpp_model.py | 44 +++++++-------- rwkv/rwkv_cpp_shared_library.py | 15 +++--- rwkv/rwkv_tokenizer.py | 4 +- tests/test_tiny_rwkv.c | 4 +- 13 files changed, 182 insertions(+), 113 deletions(-) rename CODE_STYLE.md => docs/CODE_STYLE.md (100%) rename FILE_FORMAT.md => docs/FILE_FORMAT.md (100%) create mode 100644 docs/cuBLAS_on_Windows.md diff --git a/README.md b/README.md index ef84c19..72cc7af 100644 --- a/README.md +++ b/README.md @@ -28,16 +28,19 @@ Below table is for reference only. Measurements were made on 4C/8T x86 CPU with #### With cuBLAS -Measurements were made on Intel i7 13700K & NVIDIA 3060 Ti 8G. Latency per token shown. +Measurements were made on Intel i7 13700K & NVIDIA 3060 Ti 8 GB. Latency per token in ms shown. -| Model | Layers on GPU | Format | 24 Threads | 8 Threads | 4 Threads | 2 Threads | 1 Threads | -|-----------------------|---------------|--------|-------------|------------|------------|------------|------------| -| `RWKV-4-Pile-169M` | 12 | `Q4_0` | 20.6 ms | 8.6 ms | 6.9 ms | 6.2 ms | 7.9 ms | -| `RWKV-4-Pile-169M` | 12 | `Q4_1` | 21.4 ms | 8.6 ms | 6.9 ms | 6.7 ms | 7.8 ms | -| `RWKV-4-Pile-169M` | 12 | `Q5_1` | 22.2 ms | 9.0 ms | 6.9 ms | 6.7 ms | 8.1 ms | -| `RWKV-4-Raven-7B-v11` | 32 | `Q4_0` | 94.9 ms | 54.3 ms | 50.2 ms | 51.6 ms | 59.2 ms | -| `RWKV-4-Raven-7B-v11` | 32 | `Q4_1` | 94.5 ms | 54.3 ms | 49.7 ms | 51.8 ms | 59.2 ms | -| `RWKV-4-Raven-7B-v11` | 32 | `Q5_1` | 101.6 ms | 72.3 ms | 67.2 ms | 69.3 ms | 77.0 ms | +| Model | Layers on GPU | Format | 1 thread | 2 threads | 4 threads | 8 threads | 24 threads | +|-----------------------|---------------|--------|----------|-----------|-----------|-----------|------------| +| `RWKV-4-Pile-169M` | 12 | `Q4_0` | 7.9 | 6.2 | 6.9 | 8.6 | 20 | +| `RWKV-4-Pile-169M` | 12 | `Q4_1` | 7.8 | 6.7 | 6.9 | 8.6 | 21 | +| `RWKV-4-Pile-169M` | 12 | `Q5_1` | 8.1 | 6.7 | 6.9 | 9.0 | 22 | + +| Model | Layers on GPU | Format | 1 thread | 2 threads | 4 threads | 8 threads | 24 threads | +|-----------------------|---------------|--------|----------|-----------|-----------|-----------|------------| +| `RWKV-4-Raven-7B-v11` | 32 | `Q4_0` | 59 | 51 | 50 | 54 | 94 | +| `RWKV-4-Raven-7B-v11` | 32 | `Q4_1` | 59 | 51 | 49 | 54 | 94 | +| `RWKV-4-Raven-7B-v11` | 32 | `Q5_1` | 77 | 69 | 67 | 72 | 101 | Note: since cuBLAS is supported only for `ggml_mul_mat()`, we still need to use few CPU resources to execute remaining operations. 
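For context on numbers like the ones in the table above, here is a rough per-token latency measurement sketch using the Python wrapper from this repository. It is illustration only, not the benchmark script behind the table and not part of the patch: the model path and token IDs are placeholders, and the assumption that `RWKVModel.eval` returns a `(logits, state)` pair should be checked against `rwkv/rwkv_cpp_model.py`.

```python
# Rough latency-measurement sketch (illustration only, not part of this patch).
# Assumes it is run from the rwkv/ directory of the repository,
# that 'rwkv.cpp-169M-Q4_0.bin' is a placeholder model path,
# and that RWKVModel.eval returns a (logits, state) pair.
import time

import rwkv_cpp_model
import rwkv_cpp_shared_library

library = rwkv_cpp_shared_library.load_rwkv_shared_library()
model = rwkv_cpp_model.RWKVModel(
    library,
    'rwkv.cpp-169M-Q4_0.bin',  # placeholder path
    thread_count=4,
    gpu_layer_count=12         # offload all 12 layers of the 169M model
)

state = None
n_tokens = 100
start = time.perf_counter()

for token in range(n_tokens):  # dummy token IDs; real use would encode a prompt
    logits, state = model.eval(token, state)

elapsed_ms = (time.perf_counter() - start) / n_tokens * 1000
print(f'{elapsed_ms:.1f} ms per token')
```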
@@ -68,7 +71,7 @@ This option is recommended for maximum performance, because the library would be ##### Windows -**Requirements**: [CMake](https://cmake.org/download/) or [CMake from anaconda](https://anaconda.org/conda-forge/cmake), MSVC compiler. +**Requirements**: [CMake](https://cmake.org/download/) or [CMake from anaconda](https://anaconda.org/conda-forge/cmake), [Build Tools for Visual Studio 2019](https://visualstudio.microsoft.com/vs/older-downloads/). ```commandline cmake . @@ -79,14 +82,7 @@ If everything went OK, `bin\Release\rwkv.dll` file should appear. ##### Windows + cuBLAS -**Important**: Since there are no cuBLAS static libraries for Windows, after compiling with dynamic libraries following DLLs should be copied from `{CUDA}/bin` into `build/bin/Release`: `cudart64_12.dll`, `cublas64_12.dll`, `cublasLt64_12.dll`. - -```commandline -mkdir build -cd build -cmake .. -DRWKV_CUBLAS=ON -cmake --build . --config Release -``` +Refer to [docs/cuBLAS_on_Windows.md](docs%2FcuBLAS_on_Windows.md) for a comprehensive guide. ##### Linux / MacOS @@ -104,9 +100,7 @@ If everything went OK, `librwkv.so` (Linux) or `librwkv.dylib` (MacOS) file shou ##### Linux / MacOS + cuBLAS ```commandline -mkdir build -cd build -cmake .. -DRWKV_CUBLAS=ON +cmake . -DRWKV_CUBLAS=ON cmake --build . --config Release ``` @@ -130,10 +124,10 @@ This option would require a little more manual work, but you can use it with any ```commandline # Windows -python rwkv\convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float16 +python rwkv\convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin FP16 # Linux / MacOS -python rwkv/convert_pytorch_to_ggml.py ~/Downloads/RWKV-4-Pile-169M-20220807-8023.pth ~/Downloads/rwkv.cpp-169M.bin float16 +python rwkv/convert_pytorch_to_ggml.py ~/Downloads/RWKV-4-Pile-169M-20220807-8023.pth ~/Downloads/rwkv.cpp-169M.bin FP16 ``` **Optionally**, quantize the model into one of quantized formats from the table above: @@ -218,8 +212,8 @@ For reference only, here is a list of latest versions of `rwkv.cpp` that have su - `Q4_3`, `Q4_1_O` - [commit c736ef5](https://github.com/saharNooby/rwkv.cpp/commit/c736ef5411606b529d3a74c139ee111ef1a28bb9), [release with prebuilt binaries](https://github.com/saharNooby/rwkv.cpp/releases/tag/master-1c363e6) -See also [FILE_FORMAT.md](FILE_FORMAT.md) for version numbers of `rwkv.cpp` model files and their changelog. +See also [docs/FILE_FORMAT.md](docs/FILE_FORMAT.md) for version numbers of `rwkv.cpp` model files and their changelog. ## Contributing -Please follow the code style described in [CODE_STYLE.md](CODE_STYLE.md). +Please follow the code style described in [docs/CODE_STYLE.md](docs/CODE_STYLE.md). diff --git a/CODE_STYLE.md b/docs/CODE_STYLE.md similarity index 100% rename from CODE_STYLE.md rename to docs/CODE_STYLE.md diff --git a/FILE_FORMAT.md b/docs/FILE_FORMAT.md similarity index 100% rename from FILE_FORMAT.md rename to docs/FILE_FORMAT.md diff --git a/docs/cuBLAS_on_Windows.md b/docs/cuBLAS_on_Windows.md new file mode 100644 index 0000000..051c5f3 --- /dev/null +++ b/docs/cuBLAS_on_Windows.md @@ -0,0 +1,68 @@ +# Using cuBLAS on Windows + +To get cuBLAS in `rwkv.cpp` working on Windows, go through this guide section by section. + +## Build Tools for Visual Studio 2019 + +Skip this step if you already have Build Tools installed. 
+ + To install Build Tools, go to [Visual Studio Older Downloads](https://visualstudio.microsoft.com/vs/older-downloads/), download `Visual Studio 2019 and other Products` and run the installer. + + ## CMake + + Skip this step if you already have CMake installed: running `cmake --version` should output `cmake version x.y.z`. + + Download the latest `Windows x64 Installer` from [Download | CMake](https://cmake.org/download/) and run it. + + ## CUDA Toolkit + + Skip this step if you already have CUDA Toolkit installed: running `nvcc --version` should output `nvcc: NVIDIA (R) Cuda compiler driver`. + + CUDA Toolkit must be installed **after** CMake; otherwise, CMake will not be able to see it and you will get the error [No CUDA toolset found](https://stackoverflow.com/questions/56636714/cuda-compile-problems-on-windows-cmake-error-no-cuda-toolset-found). + + Download an installer from [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) and run it. + + When installing: + + - check `Visual Studio Integration`; otherwise, CMake will not be able to see the toolkit + - optionally, uncheck driver installation: depending on the downloaded version of the toolkit, you may get an unwanted driver downgrade + + ## Building rwkv.cpp + + The only difference from the regular CPU build is the `-DRWKV_CUBLAS=ON` option: + + ```commandline + cmake . -DRWKV_CUBLAS=ON + cmake --build . --config Release + ``` + + If everything went OK, the `bin\Release\rwkv.dll` file should appear. + + ## Using the GPU + + You need to choose how many layers will be offloaded onto the GPU. In general, the more layers are offloaded, the better the performance; but you may be constrained by the VRAM size of your GPU. Increase the offloaded layer count until you get "CUDA out of memory" errors. + + If most of the computation is performed on the GPU, you will not need a high thread count. The optimal value may be as low as 1, since any additional threads would just eat CPU cycles while waiting for GPU operations to complete. + + To offload layers to the GPU: + + - if using the Python model: pass a non-zero `gpu_layer_count` to the constructor of `rwkv.rwkv_cpp_model.RWKVModel` + - if using the Python wrapper for the C library: call `rwkv.rwkv_cpp_shared_library.RWKVSharedLibrary.rwkv_gpu_offload_layers` + - if using the C library directly: call `bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers)` + + ## Fixing issues + + You may get the following error: `FileNotFoundError: Could not find module '...\rwkv.dll' (or one of its dependencies). Try using the full path with constructor syntax.` + + This means that the application could not find the CUDA libraries that `rwkv.dll` depends on.
+ + To fix this: + + - navigate to the folder where CUDA Toolkit is installed + - usually, it looks like `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin` + - find three DLLs in the `bin` folder: + - `cudart64_110.dll` + - `cublas64_11.dll` + - `cublasLt64_11.dll` + - copy these DLLs to the folder containing `rwkv.dll` + - usually, the folder is `rwkv.cpp/bin/Release` diff --git a/extras/CMakeLists.txt b/extras/CMakeLists.txt index 505f1d8..d4e7cd6 100644 --- a/extras/CMakeLists.txt +++ b/extras/CMakeLists.txt @@ -1,15 +1,15 @@ function(rwkv_add_extra source) - get_filename_component(EXTRA_TARGET ${source} NAME_WE) - add_executable(rwkv_${EXTRA_TARGET} ${source}) - target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml rwkv) - if (RWKV_STATIC) - get_target_property(target_LINK_OPTIONS rwkv_${EXTRA_TARGET} LINK_OPTIONS) - list(REMOVE_ITEM target_LINK_OPTIONS "-static") - set_target_properties(rwkv_${EXTRA_TARGET} PROPERTIES LINK_OPTIONS "${target_LINK_OPTIONS}") - endif() + get_filename_component(EXTRA_TARGET ${source} NAME_WE) + add_executable(rwkv_${EXTRA_TARGET} ${source}) + target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml rwkv) + if (RWKV_STATIC) + get_target_property(target_LINK_OPTIONS rwkv_${EXTRA_TARGET} LINK_OPTIONS) + list(REMOVE_ITEM target_LINK_OPTIONS "-static") + set_target_properties(rwkv_${EXTRA_TARGET} PROPERTIES LINK_OPTIONS "${target_LINK_OPTIONS}") + endif() endfunction() file(GLOB extras *.c) foreach (extra ${extras}) - rwkv_add_extra(${extra}) + rwkv_add_extra(${extra}) endforeach() diff --git a/rwkv.cpp b/rwkv.cpp index b99ac6a..0ee6518 100644 --- a/rwkv.cpp +++ b/rwkv.cpp @@ -174,8 +174,8 @@ bool rwkv_fwrite_data(FILE * file, const void * data, const size_t length) { #define TYPE_UNKNOWN TYPE_COUNT enum rwkv_type { - TYPE_F32, - TYPE_F16, + TYPE_FP32, + TYPE_FP16, TYPE_Q4_0, TYPE_Q4_1, TYPE_Q4_1_O, // Unsupported @@ -190,8 +190,8 @@ enum rwkv_type { #define GGML_TYPE_UNKNOWN GGML_TYPE_COUNT extern const enum ggml_type rwkv_type_to_ggml[TYPE_COUNT + 1] = { - GGML_TYPE_F32, /* F32 */ - GGML_TYPE_F16, /* F16 */ + GGML_TYPE_F32, /* FP32 */ + GGML_TYPE_F16, /* FP16 */ GGML_TYPE_Q4_0, /* Q4_0 */ GGML_TYPE_Q4_1, /* Q4_1 */ GGML_TYPE_UNKNOWN, /* Q4_1_O */ @@ -204,8 +204,8 @@ extern const enum ggml_type rwkv_type_to_ggml[TYPE_COUNT + 1] = { }; extern const enum rwkv_type rwkv_type_from_ggml[GGML_TYPE_COUNT + 1] = { - TYPE_F32, /* F32 */ - TYPE_F16, /* F16 */ + TYPE_FP32, /* FP32 */ + TYPE_FP16, /* FP16 */ TYPE_Q4_0, /* Q4_0 */ TYPE_Q4_1, /* Q4_1 */ TYPE_Q4_2, /* Q4_2 */ @@ -220,7 +220,7 @@ extern const enum rwkv_type rwkv_type_from_ggml[GGML_TYPE_COUNT + 1] = { TYPE_COUNT, /* COUNT */ }; -extern const char * rwkv_type_to_string[TYPE_COUNT + 1] = {"float32", "float16", "Q4_0", "Q4_1", "Q4_1_O", "Q4_2", "Q4_3", "Q5_0", "Q5_1", "Q8_0", "unknown"}; +extern const char * rwkv_type_to_string[TYPE_COUNT + 1] = {"FP32", "FP16", "Q4_0", "Q4_1", "Q4_1_O", "Q4_2", "Q4_3", "Q5_0", "Q5_1", "Q8_0", "unknown"}; enum rwkv_type rwkv_type_from_string(const char * str) { for (int ord = 0; ord < TYPE_COUNT; ord++) { @@ -429,7 +429,7 @@ struct rwkv_model { struct ggml_tensor * ln0_weight; struct ggml_tensor * ln0_bias; - std::unique_ptr<struct rwkv_layer []> layers; + std::unique_ptr<struct rwkv_layer[]> layers; struct ggml_tensor * ln_out_weight; struct ggml_tensor * ln_out_bias; @@ -580,9 +580,9 @@ struct rwkv_context { // Reused by all graphs.
struct rwkv_ggml_context ctx; struct ggml_tensor * input_state; - std::unique_ptr input_layers; + std::unique_ptr input_layers; struct ggml_tensor * output_state; - std::unique_ptr output_layers; + std::unique_ptr output_layers; struct ggml_tensor * logits; uint32_t n_threads; @@ -598,8 +598,7 @@ struct rwkv_context { enum rwkv_error_flags last_error; bool print_errors; - size_t gpu_layers; - size_t vram_total; + uint32_t gpu_layers; }; // https://stackoverflow.com/a/6458689 @@ -610,7 +609,7 @@ bool rwkv_set_params(struct rwkv_model & model, F callback) { RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.bias", model.ln0_bias)); uint32_t n_layer = model.header.n_layer; - std::unique_ptr layers(new(std::nothrow) struct rwkv_layer [n_layer]); + std::unique_ptr layers(new(std::nothrow) struct rwkv_layer[n_layer]); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, layers.get(), "Failed to allocate model layers"); model.layers = std::move(layers); @@ -1203,11 +1202,11 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr inputs(new(std::nothrow) struct rwkv_layer_state [n_layer]); + std::unique_ptr inputs(new(std::nothrow) struct rwkv_layer_state[n_layer]); RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, inputs.get(), "Failed to allocate input state parts"); // We collect parts of output state here. Each part is (n_embed) vector. - std::unique_ptr outputs(new(std::nothrow) struct rwkv_layer_state [n_layer]); + std::unique_ptr outputs(new(std::nothrow) struct rwkv_layer_state[n_layer]); RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, outputs.get(), "Failed to allocate output state parts"); for (size_t i = 0; i < n_layer; i++) { @@ -1277,31 +1276,29 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32 return clone; } -bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers) { +bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) { #ifdef GGML_USE_CUBLAS - size_t n_gpu = std::min(n_gpu_layers, ctx->instance->model.header.n_layer); + uint32_t layers_to_offload = std::min(n_layers, ctx->instance->model.header.n_layer - ctx->gpu_layers); - size_t gpu_layers = ctx->gpu_layers; - size_t vram_total = ctx->vram_total; + for (uint32_t i = 0; i < layers_to_offload; i++) { + const struct rwkv_layer & layer = ctx->instance->model.layers[ctx->gpu_layers + i]; - for (size_t i = 0; i < n_gpu; i++) { - const struct rwkv_layer & layer = ctx->instance->model.layers[i]; + // Use cuBLAS only for heavy matrices; other operations are not supported for the GPU at the moment + ggml_cuda_transform_tensor(layer.att_key); + ggml_cuda_transform_tensor(layer.att_value); + ggml_cuda_transform_tensor(layer.att_receptance); + ggml_cuda_transform_tensor(layer.att_output); - // Use cuBLAS only for heavy matrices; other operations are not supported for GPU at the moment - ggml_cuda_transform_tensor(layer.att_key); vram_total += ggml_nbytes(layer.att_key); - ggml_cuda_transform_tensor(layer.att_value); vram_total += ggml_nbytes(layer.att_value); - ggml_cuda_transform_tensor(layer.att_receptance); vram_total += ggml_nbytes(layer.att_receptance); - ggml_cuda_transform_tensor(layer.att_output); vram_total += ggml_nbytes(layer.att_output); + ggml_cuda_transform_tensor(layer.ffn_key); + ggml_cuda_transform_tensor(layer.ffn_value); + ggml_cuda_transform_tensor(layer.ffn_receptance); + } - ggml_cuda_transform_tensor(layer.ffn_key); vram_total += ggml_nbytes(layer.ffn_key); - ggml_cuda_transform_tensor(layer.ffn_value); vram_total += ggml_nbytes(layer.ffn_value); - 
ggml_cuda_transform_tensor(layer.ffn_receptance); vram_total += ggml_nbytes(layer.ffn_receptance); + ctx->gpu_layers += layers_to_offload; - gpu_layers++; - } + return layers_to_offload > 0; #endif - - return true; + return false; } void rwkv_set_inputs(const struct rwkv_context * ctx, const float * state_in) { @@ -1464,7 +1461,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const RWKV_ASSERT_FALSE_MSG( RWKV_ERROR_FILE, in_type == GGML_TYPE_F32 || in_type == GGML_TYPE_F16, - "Unsupported input data type (%s); needs to be F32 or F16", + "Unsupported input data type (%s); needs to be FP32 or FP16", rwkv_type_to_string[rwkv_type_from_ggml[in_type]] ); @@ -1477,7 +1474,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const size_t orig_total_size = 0; size_t new_total_size = 0; - // Required to init the fp16 tables + // Required to init the F16 tables // Doesn't crash if ggml_init fails ggml_free(ggml_init({ 0, NULL, true })); @@ -1496,7 +1493,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const } // f16 type tensors get relocated to out and then converted into f32 at in - if (header.data_type == TYPE_F16) { + if (header.data_type == TYPE_FP16) { if (in_size > max_out_size) { max_out_size = in_size; } @@ -1524,7 +1521,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const // This is a histogram of quantized values. If it shows single 1.0, then all 0.0, something went very wrong! int64_t hist_all[16] {}; - std::unique_ptr scratch(new(std::nothrow) uint8_t [max_in_size + max_out_size]); + std::unique_ptr scratch(new(std::nothrow) uint8_t[max_in_size + max_out_size]); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, scratch.get(), "Failed to allocate buffer"); uint8_t * in_buf = scratch.get(); @@ -1542,19 +1539,19 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const const char * name_str = name.c_str(); RWKV_MSG("%*s - [%5" PRId32 ", %5" PRId32 "], type = %6s ", (int) max_key_length, name_str, header.width, header.height, rwkv_type_to_string[header.data_type]); - data = header.data_type == TYPE_F16 ? out_buf : in_buf; + data = header.data_type == TYPE_FP16 ? out_buf : in_buf; size_t orig_size = rwkv_tensor_size(header), new_size = orig_size; RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_data(in_file.file, orig_size, data), "\nFailed to read tensor data of %s", name_str); // Quantize only 2D tensors, except embedding and head matrices. // Embedding and head take not too much space, especially in bigger models; // but they significantly increase perplexity when quantized. - if ((header.data_type == TYPE_F32 || header.data_type == TYPE_F16) && header.dim_count == 2 && name != "emb.weight" && name != "head.weight") { + if ((header.data_type == TYPE_FP32 || header.data_type == TYPE_FP16) && header.dim_count == 2 && name != "emb.weight" && name != "head.weight") { RWKV_MSG("quantizing... "); size_t nelements = (size_t) header.width * (size_t) header.height; - if (header.data_type == TYPE_F16) { + if (header.data_type == TYPE_FP16) { ggml_fp16_to_fp32_row((const ggml_fp16_t *) out_buf, (float *) in_buf, nelements); } diff --git a/rwkv.h b/rwkv.h index 8327425..493dffe 100644 --- a/rwkv.h +++ b/rwkv.h @@ -97,9 +97,10 @@ extern "C" { // - n_threads: count of threads to use, must be positive. 
RWKV_API struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32_t n_threads); - // Offloads specified layers of context onto GPU using cuBLAS, if it is enabled. - // If rwkv.cpp was compiled without cuBLAS support, this function is a no-op. - RWKV_API bool rwkv_gpu_offload_layers(const struct rwkv_context * ctx, const uint32_t n_gpu_layers); + // Offloads specified count of model layers onto the GPU. Offloaded layers are evaluated using cuBLAS. + // Returns true if at least one layer was offloaded. + // If rwkv.cpp was compiled without cuBLAS support, this function is a no-op and always returns false. + RWKV_API bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers); // Evaluates the model for a single token. // Not thread-safe. For parallel inference, call rwkv_clone_context to create one rwkv_context for each thread. diff --git a/rwkv/convert_pytorch_to_ggml.py b/rwkv/convert_pytorch_to_ggml.py index 2ea4a48..958fbe1 100644 --- a/rwkv/convert_pytorch_to_ggml.py +++ b/rwkv/convert_pytorch_to_ggml.py @@ -1,5 +1,5 @@ # Converts an RWKV model checkpoint in PyTorch format to an rwkv.cpp compatible file. -# Usage: python convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M.bin float32 +# Usage: python convert_pytorch_to_ggml.py C:\RWKV-4-Pile-169M-20220807-8023.pth C:\rwkv.cpp-169M-FP16.bin FP16 # Get model checkpoints from https://huggingface.co/BlinkDL # See FILE_FORMAT.md for the documentation on the file format. @@ -12,7 +12,7 @@ def parse_args(): parser = argparse.ArgumentParser(description='Convert an RWKV model checkpoint in PyTorch format to an rwkv.cpp compatible file') parser.add_argument('src_path', help='Path to PyTorch checkpoint file') parser.add_argument('dest_path', help='Path to rwkv.cpp checkpoint file, will be overwritten') - parser.add_argument('data_type', help='Data type, float16 or float32', type=str, choices=['float16', 'float32'], default='float32') + parser.add_argument('data_type', help='Data type, FP16 or FP32', type=str, choices=['FP16', 'FP32', 'float16', 'float32'], default='FP16') return parser.parse_args() def get_layer_count(state_dict: Dict[str, torch.Tensor]) -> int: @@ -33,6 +33,8 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t n_embed = emb_weight.shape[1] with open(dest_path, 'wb') as out_file: + is_FP16: bool = data_type == 'FP16' or data_type == 'float16' + out_file.write(struct.pack( # Disable padding with '=' '=iiiiii', @@ -42,7 +44,7 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t n_vocab, n_embed, n_layer, - 1 if data_type == 'float16' else 0 + 1 if is_FP16 else 0 )) for k in state_dict.keys(): @@ -56,8 +58,8 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t if '.time_decay' in k: tensor = -torch.exp(tensor) - # Keep 1-dim vectors in fp32 - if data_type == 'float16' and len(tensor.shape) > 1: + # Keep 1-dim vectors in FP32 + if is_FP16 and len(tensor.shape) > 1: tensor = tensor.half() shape = tensor.shape diff --git a/rwkv/convert_pytorch_to_ggml.test.py b/rwkv/convert_pytorch_to_ggml.test.py index 9ced1d0..501a85e 100644 --- a/rwkv/convert_pytorch_to_ggml.test.py +++ b/rwkv/convert_pytorch_to_ggml.test.py @@ -13,7 +13,7 @@ def test() -> None: 'blocks.0.ln1.weight': torch.tensor([1], dtype=torch.float32) } - convert_pytorch_to_ggml.write_state_dict(state_dict, dest_path=test_file_path, data_type='float32') + convert_pytorch_to_ggml.write_state_dict(state_dict, 
dest_path=test_file_path, data_type='FP32') with open(test_file_path, 'rb') as input: actual_bytes: bytes = input.read() diff --git a/rwkv/rwkv_cpp_model.py b/rwkv/rwkv_cpp_model.py index 74be878..1d166ff 100644 --- a/rwkv/rwkv_cpp_model.py +++ b/rwkv/rwkv_cpp_model.py @@ -14,7 +14,8 @@ def __init__( shared_library: rwkv_cpp_shared_library.RWKVSharedLibrary, model_path: str, thread_count: int = max(1, multiprocessing.cpu_count() // 2), - gpu_layers_count: int = 0, + gpu_layer_count: int = 0, + **kwargs ): """ Loads the model and prepares it for inference. @@ -28,18 +29,23 @@ def __init__( Path to RWKV model file in ggml format. thread_count : int Thread count to use. If not set, defaults to CPU count / 2. + gpu_layer_count : int + Count of layers to offload onto the GPU, must be >= 0. """ + if 'gpu_layers_count' in kwargs: + gpu_layer_count = kwargs['gpu_layers_count'] + assert os.path.isfile(model_path), f'{model_path} is not a file' - assert thread_count > 0, 'Thread count must be positive' - assert gpu_layers_count >= 0, 'GPU layers count must be >= 0' + assert thread_count > 0, 'Thread count must be > 0' + assert gpu_layer_count >= 0, 'GPU layer count must be >= 0' self._library = shared_library self._ctx = self._library.rwkv_init_from_file(model_path, thread_count) - if gpu_layers_count > 0: - self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layers_count) + if gpu_layer_count > 0: + self._library.rwkv_gpu_offload_layers(self._ctx, gpu_layer_count) self._state_buffer_element_count = self._library.rwkv_get_state_buffer_element_count(self._ctx) self._logits_buffer_element_count = self._library.rwkv_get_logits_buffer_element_count(self._ctx) @@ -76,25 +82,20 @@ def eval( assert self._valid, 'Model was freed' - def validate_buffer(buf: torch.Tensor, name: str, size: int) -> None: - assert buf.dtype == torch.float32, f'{name} is not of type float32' - assert buf.is_contiguous(), f'{name} is not contiguous' - assert buf.shape == (size,), f'{name} has invalid shape {buf.shape}, expected ({size})' - if state_in is not None: - validate_buffer(state_in, 'state_in', self._state_buffer_element_count) + validate_tensor(state_in, 'state_in', self._state_buffer_element_count) state_in_ptr = state_in.data_ptr() else: state_in_ptr = 0 if state_out is not None: - validate_buffer(state_out, 'state_out', self._state_buffer_element_count) + validate_tensor(state_out, 'state_out', self._state_buffer_element_count) else: state_out = torch.zeros(self._state_buffer_element_count, dtype=torch.float32, device='cpu') if logits_out is not None: - validate_buffer(logits_out, 'logits_out', self._logits_buffer_element_count) + validate_tensor(logits_out, 'logits_out', self._logits_buffer_element_count) else: logits_out = torch.zeros(self._logits_buffer_element_count, dtype=torch.float32, device='cpu') @@ -138,25 +139,20 @@ def eval_sequence( assert self._valid, 'Model was freed' - def validate_buffer(buf: torch.Tensor, name: str, size: int) -> None: - assert buf.dtype == torch.float32, f'{name} is not of type float32' - assert buf.is_contiguous(), f'{name} is not contiguous' - assert buf.shape == (size,), f'{name} has invalid shape {buf.shape}, expected ({size})' - if state_in is not None: - validate_buffer(state_in, 'state_in', self._state_buffer_element_count) + validate_tensor(state_in, 'state_in', self._state_buffer_element_count) state_in_ptr = state_in.data_ptr() else: state_in_ptr = 0 if state_out is not None: - validate_buffer(state_out, 'state_out', self._state_buffer_element_count) + 
validate_tensor(state_out, 'state_out', self._state_buffer_element_count) else: state_out = torch.zeros(self._state_buffer_element_count, dtype=torch.float32, device='cpu') if logits_out is not None: - validate_buffer(logits_out, 'logits_out', self._logits_buffer_element_count) + validate_tensor(logits_out, 'logits_out', self._logits_buffer_element_count) else: logits_out = torch.zeros(self._logits_buffer_element_count, dtype=torch.float32, device='cpu') @@ -187,3 +183,9 @@ def __del__(self): # Free the context on GC in case user forgot to call free() explicitly. if hasattr(self, '_valid') and self._valid: self.free() + +def validate_tensor(buf: torch.Tensor, name: str, size: int) -> None: + assert buf.device == torch.device('cpu'), f'{name} is not on CPU' + assert buf.dtype == torch.float32, f'{name} is not of type float32' + assert buf.shape == (size,), f'{name} has invalid shape {buf.shape}, expected ({size})' + assert buf.is_contiguous(), f'{name} is not contiguous' diff --git a/rwkv/rwkv_cpp_shared_library.py b/rwkv/rwkv_cpp_shared_library.py index d4854cd..ac6b722 100644 --- a/rwkv/rwkv_cpp_shared_library.py +++ b/rwkv/rwkv_cpp_shared_library.py @@ -100,20 +100,23 @@ def rwkv_init_from_file(self, model_file_path: str, thread_count: int) -> RWKVCo return RWKVContext(ptr) - def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None: + def rwkv_gpu_offload_layers(self, ctx: RWKVContext, layer_count: int) -> bool: """ - Offloads specified layers of context onto GPU using cuBLAS, if it is enabled. - If rwkv.cpp was compiled without cuBLAS support, this function is a no-op. + Offloads specified count of model layers onto the GPU. Offloaded layers are evaluated using cuBLAS. + Returns true if at least one layer was offloaded. + If rwkv.cpp was compiled without cuBLAS support, this function is a no-op and always returns false. Parameters ---------- ctx : RWKVContext RWKV context obtained from rwkv_init_from_file. - gpu_layers_count : int - Count of layers to load onto gpu, must be >= 0. + layer_count : int + Count of layers to offload onto the GPU, must be >= 0. """ - assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr' + assert layer_count >= 0, 'Layer count must be >= 0' + + return self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(layer_count)) def rwkv_eval( self, diff --git a/rwkv/rwkv_tokenizer.py b/rwkv/rwkv_tokenizer.py index 10ee52d..cf033b7 100644 --- a/rwkv/rwkv_tokenizer.py +++ b/rwkv/rwkv_tokenizer.py @@ -112,7 +112,9 @@ def encode(self, src: str) -> List[int]: return self.encode_bytes(src.encode('utf-8')) def decode(self, tokens: List[int]) -> str: - return self.decode_bytes(tokens).decode('utf-8') + # 'replace' error handling mode will insert \uFFFD characters in place of malformed/partial UTF-8 sequences. + # Downstream code needs to detect \uFFFD and attempt to postpone decoding until more tokens arrive and UTF-8 sequences are complete. 
+ return self.decode_bytes(tokens).decode('utf-8', errors='replace') def get_tokenizer(tokenizer: str = '20B') -> Tuple[ Callable[[List[int]], str], diff --git a/tests/test_tiny_rwkv.c b/tests/test_tiny_rwkv.c index adb0de7..2a12329 100644 --- a/tests/test_tiny_rwkv.c +++ b/tests/test_tiny_rwkv.c @@ -21,7 +21,6 @@ #define N_VOCAB 256 #define N_THREADS 2 -#define N_GPU_LAYERS 1 void test_model(const char * model_path, const float * expected_logits, const float max_diff) { fprintf(stderr, "Testing %s\n", model_path); @@ -29,8 +28,9 @@ void test_model(const char * model_path, const float * expected_logits, const fl struct rwkv_context * model = rwkv_init_from_file(model_path, N_THREADS); enum rwkv_error_flags error = rwkv_get_last_error(NULL); ASSERT(error == 0, "Unexpected error %d", error); + #ifdef GGML_USE_CUBLAS - ASSERT(rwkv_gpu_offload_layers(model, N_GPU_LAYERS), "Unexpected error %d", rwkv_get_last_error(model)); + ASSERT(rwkv_gpu_offload_layers(model, rwkv_get_n_layer(model)), "Failed to offload layers to GPU"); #endif const size_t n_vocab = rwkv_get_logits_len(model);
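As a usage note for the tokenizer change above (`errors='replace'` in `rwkv/rwkv_tokenizer.py`): the new comment says downstream code should detect `\uFFFD` and postpone decoding until the UTF-8 sequence is complete. A minimal sketch of that strategy follows; the `stream_decode` helper is hypothetical and not part of this patch, and only the replacement-character behaviour of `decode` comes from it.

```python
# Hypothetical helper illustrating the strategy from the new comment in rwkv_tokenizer.py;
# it is not part of this patch.
from typing import Callable, Iterable, Iterator, List

def stream_decode(tokens: Iterable[int], decode: Callable[[List[int]], str]) -> Iterator[str]:
    # Yield printable chunks, holding back output while the decoded text contains U+FFFD,
    # which (with errors='replace') signals an incomplete UTF-8 sequence split across tokens.
    pending: List[int] = []
    for token in tokens:
        pending.append(token)
        text = decode(pending)
        if '\uFFFD' not in text:
            yield text
            pending.clear()
    # Anything still pending is emitted as-is; it may legitimately contain U+FFFD
    # if the byte sequence really was malformed rather than merely incomplete.
    if pending:
        yield decode(pending)

# Hypothetical usage with the tokenizer from this repository
# (check the return order of get_tokenizer before relying on it):
#   decode, _ = rwkv_tokenizer.get_tokenizer('20B')
#   for chunk in stream_decode(generated_token_ids, decode):
#       print(chunk, end='', flush=True)
```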