Skip to content

Commit

Permalink
Decrease memory padding for serial and sequential contexts (#132)
Browse files Browse the repository at this point in the history
  • Loading branch information
saharNooby committed Sep 23, 2023
1 parent 39ed572 commit 0df970a
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions rwkv_graph.inc
Expand Up @@ -322,12 +322,13 @@ static bool rwkv_measure_and_build_serial_context(struct rwkv_model & model, str
struct ggml_allocr * allocator = ggml_allocr_new_measure(tensor_alignment);

size_t required_context_size = ggml_allocr_alloc_graph(allocator, graph.cgraph.get()) +
// With the node limit set to 80K, this overhead would be 28 MB.
+ rwkv_ggml_overhead()
+ tensor_alignment
// For some reason, calculation above does not result in enough memory allocated.
// For some reason, `ggml_allocr_alloc_graph` underestimates required memory amount.
// Instead of diving deep into ggml internals to debug this issue, I will just add some padding.
// 64 MB seems to be enough for Raven 14B model.
+ size_t(64) * 1024 * 1024;
// 40 MB seems to be enough for Raven 14B model when GGML_MAX_NODES is set to default value of 4096.
+ size_t(40) * 1024 * 1024;

ggml_allocr_free(allocator);
ggml_free(graph.ggml_ctx);
Expand Down Expand Up @@ -444,12 +445,13 @@ static bool rwkv_measure_and_build_sequential_context(struct rwkv_model & model,
struct ggml_allocr * allocator = ggml_allocr_new_measure(tensor_alignment);

size_t required_context_size = ggml_allocr_alloc_graph(allocator, graph.cgraph.get()) +
// With the node limit set to 80K, this overhead would be 28 MB.
+ rwkv_ggml_overhead()
+ tensor_alignment
// For some reason, calculation above does not result in enough memory allocated.
// For some reason, `ggml_allocr_alloc_graph` underestimates required memory amount.
// Instead of diving deep into ggml internals to debug this issue, I will just add some padding.
// 64 MB per token seems to be enough for Raven 14B model. It works for sequence_length up to 71 at least.
+ sequence_length * 64 * 1024 * 1024;
// 40 MB per token seems to be enough for Raven 14B model. It works for sequence_length at least up to 71.
+ sequence_length * 40 * 1024 * 1024;

ggml_allocr_free(allocator);
ggml_free(graph.ggml_ctx);
Expand Down

0 comments on commit 0df970a

Please sign in to comment.