-
Notifications
You must be signed in to change notification settings - Fork 77
/
rwkv_cpp_shared_library.py
287 lines (228 loc) · 10 KB
/
rwkv_cpp_shared_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import os
import sys
import ctypes
import pathlib
import enum
from typing import Dict, Optional
QUANTIZED_FORMAT_NAMES = (
'Q4_0',
'Q4_1',
'Q5_0',
'Q5_1',
'Q8_0'
)
P_FLOAT = ctypes.POINTER(ctypes.c_float)
class RWKVInitFromFileOptionKey(enum.Enum):
# Sets target format of model parameters.
#
# If an FP16 or FP32 model is being loaded, and this option is set,
# parameters will be quantized just-in-time into the specified format.
# If an already quantized model is being loaded, value of this option is ignored.
# The function will not read the whole model file at once, but will do quantization tensor-by-tensor;
# it is safe to load big models which will fit into RAM when quantized.
# Use of this option will introduce significant one-time delay when loading the model.
#
# Intended use-case is to have only FP16 model on disk, while not wasting
# the disk space on models of all available quantized formats.
#
# For allowed values, see QUANTIZED_FORMAT_NAMES.
TARGET_FORMAT_NAME = 0
class RWKVInitFromFileOption(ctypes.Structure):
_fields_ = [
('key', ctypes.c_int),
('value', ctypes.c_char_p)
]
class RWKVContext:
def __init__(self, ptr: ctypes.pointer):
self.ptr = ptr
class RWKVSharedLibrary:
"""
Python wrapper around rwkv.cpp shared library.
"""
def __init__(self, shared_library_path: str):
"""
Loads the shared library from specified file.
In case of any error, this method will throw an exception.
Parameters
----------
shared_library_path : str
Path to rwkv.cpp shared library. On Windows, it would look like 'rwkv.dll'. On UNIX, 'rwkv.so'.
"""
self.library = ctypes.cdll.LoadLibrary(shared_library_path)
self.library.rwkv_init_from_file_ex.argtypes = [ctypes.c_char_p, ctypes.c_uint32, ctypes.POINTER(RWKVInitFromFileOption), ctypes.c_size_t]
self.library.rwkv_init_from_file_ex.restype = ctypes.c_void_p
self.library.rwkv_gpu_offload_layers.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
self.library.rwkv_gpu_offload_layers.restype = ctypes.c_bool
self.library.rwkv_eval.argtypes = [
ctypes.c_void_p, # ctx
ctypes.c_int32, # token
P_FLOAT, # state_in
P_FLOAT, # state_out
P_FLOAT # logits_out
]
self.library.rwkv_eval.restype = ctypes.c_bool
self.library.rwkv_get_state_buffer_element_count.argtypes = [ctypes.c_void_p]
self.library.rwkv_get_state_buffer_element_count.restype = ctypes.c_uint32
self.library.rwkv_get_logits_buffer_element_count.argtypes = [ctypes.c_void_p]
self.library.rwkv_get_logits_buffer_element_count.restype = ctypes.c_uint32
self.library.rwkv_free.argtypes = [ctypes.c_void_p]
self.library.rwkv_free.restype = None
self.library.rwkv_free.argtypes = [ctypes.c_void_p]
self.library.rwkv_free.restype = None
self.library.rwkv_quantize_model_file.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p]
self.library.rwkv_quantize_model_file.restype = ctypes.c_bool
self.library.rwkv_get_system_info_string.argtypes = []
self.library.rwkv_get_system_info_string.restype = ctypes.c_char_p
def rwkv_init_from_file(self, model_file_path: str, thread_count: int, options: Optional[Dict[RWKVInitFromFileOptionKey, str]] = None) -> RWKVContext:
"""
Loads the model from a file and prepares it for inference.
Loading behavior can be customized with options, but none of them are required.
Throws an exception in case of any error. Error messages would be printed to stderr.
Parameters
----------
model_file_path : str
Path to model file in ggml format.
thread_count : int
Count of threads to use, must be positive.
options : Optional[Dict[RWKVInitFromFileOptionKey, str]]
Options passed to rwkv_init_from_file_ex.
"""
options_count = 0
options_ptr = None
if options is not None and len(options) > 0:
options_count = len(options)
options_ptr = (RWKVInitFromFileOption * options_count)()
i = 0
for k, v in options.items():
options_ptr[i].key = k.value
options_ptr[i].value = v.encode('utf-8')
i += 1
ptr = self.library.rwkv_init_from_file_ex(model_file_path.encode('utf-8'), ctypes.c_uint32(thread_count), options_ptr, options_count)
assert ptr is not None, 'rwkv_init_from_file failed, check stderr'
return RWKVContext(ptr)
def rwkv_gpu_offload_layers(self, ctx: RWKVContext, gpu_layers_count: int) -> None:
"""
Offloads specified layers of context onto GPU using cuBLAS, if it is enabled.
If rwkv.cpp was compiled without cuBLAS support, this function is a no-op.
Parameters
----------
ctx : RWKVContext
RWKV context obtained from rwkv_init_from_file.
gpu_layers_count : int
Count of layers to load onto gpu, must be >= 0.
"""
assert self.library.rwkv_gpu_offload_layers(ctx.ptr, ctypes.c_uint32(gpu_layers_count)), 'rwkv_gpu_offload_layers failed, check stderr'
def rwkv_eval(
self,
ctx: RWKVContext,
token: int,
state_in_address: Optional[int],
state_out_address: int,
logits_out_address: int
) -> None:
"""
Evaluates the model for a single token.
Throws an exception in case of any error. Error messages would be printed to stderr.
Parameters
----------
ctx : RWKVContext
RWKV context obtained from rwkv_init_from_file.
token : int
Next token index, in range 0 <= token < n_vocab.
state_in_address : int
Address of the first element of a FP32 buffer of size rwkv_get_state_buffer_element_count; or None, if this is a first pass.
state_out_address : int
Address of the first element of a FP32 buffer of size rwkv_get_state_buffer_element_count. This buffer will be written to.
logits_out_address : int
Address of the first element of a FP32 buffer of size rwkv_get_logits_buffer_element_count. This buffer will be written to.
"""
assert self.library.rwkv_eval(
ctx.ptr,
ctypes.c_int32(token),
ctypes.cast(0 if state_in_address is None else state_in_address, P_FLOAT),
ctypes.cast(state_out_address, P_FLOAT),
ctypes.cast(logits_out_address, P_FLOAT)
), 'rwkv_eval failed, check stderr'
def rwkv_get_state_buffer_element_count(self, ctx: RWKVContext) -> int:
"""
Returns count of FP32 elements in state buffer.
Parameters
----------
ctx : RWKVContext
RWKV context obtained from rwkv_init_from_file.
"""
return self.library.rwkv_get_state_buffer_element_count(ctx.ptr)
def rwkv_get_logits_buffer_element_count(self, ctx: RWKVContext) -> int:
"""
Returns count of FP32 elements in logits buffer.
Parameters
----------
ctx : RWKVContext
RWKV context obtained from rwkv_init_from_file.
"""
return self.library.rwkv_get_logits_buffer_element_count(ctx.ptr)
def rwkv_free(self, ctx: RWKVContext) -> None:
"""
Frees all allocated memory and the context.
Parameters
----------
ctx : RWKVContext
RWKV context obtained from rwkv_init_from_file.
"""
self.library.rwkv_free(ctx.ptr)
ctx.ptr = ctypes.cast(0, ctypes.c_void_p)
def rwkv_quantize_model_file(self, model_file_path_in: str, model_file_path_out: str, format_name: str) -> None:
"""
Quantizes FP32 or FP16 model to one of INT4 formats.
Throws an exception in case of any error. Error messages would be printed to stderr.
Parameters
----------
model_file_path_in : str
Path to model file in ggml format, must be either FP32 or FP16.
model_file_path_out : str
Quantized model will be written here.
format_name : str
One of QUANTIZED_FORMAT_NAMES.
"""
assert format_name in QUANTIZED_FORMAT_NAMES, f'Unknown format name {format_name}, use one of {QUANTIZED_FORMAT_NAMES}'
assert self.library.rwkv_quantize_model_file(
model_file_path_in.encode('utf-8'),
model_file_path_out.encode('utf-8'),
format_name.encode('utf-8')
), 'rwkv_quantize_model_file failed, check stderr'
def rwkv_get_system_info_string(self) -> str:
"""
Returns system information string.
"""
return self.library.rwkv_get_system_info_string().decode('utf-8')
def load_rwkv_shared_library() -> RWKVSharedLibrary:
"""
Attempts to find rwkv.cpp shared library and load it.
To specify exact path to the library, create an instance of RWKVSharedLibrary explicitly.
"""
file_name: str
if 'win32' in sys.platform or 'cygwin' in sys.platform:
file_name = 'rwkv.dll'
elif 'darwin' in sys.platform:
file_name = 'librwkv.dylib'
else:
file_name = 'librwkv.so'
repo_root_dir: pathlib.Path = pathlib.Path(os.path.abspath(__file__)).parent.parent
paths = [
# If we are in "rwkv" directory
f'../bin/Release/{file_name}',
# If we are in repo root directory
f'bin/Release/{file_name}',
# If we compiled in build directory
f'build/bin/Release/{file_name}',
# If we compiled in build directory
f'build/{file_name}',
# Search relative to this file
str(repo_root_dir / 'bin' / 'Release' / file_name),
# Fallback
str(repo_root_dir / file_name)
]
for path in paths:
if os.path.isfile(path):
return RWKVSharedLibrary(path)
return RWKVSharedLibrary(paths[-1])