Rewrite synchronization primitives
So far this is improving token generation speed by 10%, and it also
appears to be unleashing even better prompt eval times.
jart committed Apr 28, 2024
1 parent 58d2ca0 · commit 6162004
Showing 8 changed files with 537 additions and 1,231 deletions.
1,585 changes: 443 additions & 1,142 deletions llama.cpp/ggml.c

Large diffs are not rendered by default.
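
The ggml.c diff, where the new barrier actually lives, is collapsed above. To give a sense of what a synchronization primitive like this involves, here is a minimal sketch of a reusable spin barrier built on C11 atomics. It is an illustration only: the field names, memory orderings, and spin strategy are assumptions, not the code from this commit.

#include <stdatomic.h>

// Illustrative only: one barrier shared by all nth worker threads.
struct ggml_barrier {
    atomic_uint phase;  // bumped once per release; spinners watch this
    atomic_int count;   // how many threads have arrived so far
    int nth;            // number of participating threads
};

void ggml_syncthreads(struct ggml_barrier *b) {
    unsigned phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
    if (atomic_fetch_add_explicit(&b->count, 1, memory_order_acq_rel) + 1 == b->nth) {
        // last arrival: reset the counter, then release the spinners
        atomic_store_explicit(&b->count, 0, memory_order_relaxed);
        atomic_store_explicit(&b->phase, phase + 1, memory_order_release);
    } else {
        // earlier arrivals: spin until the last thread bumps the phase
        while (atomic_load_explicit(&b->phase, memory_order_acquire) == phase)
            ;
    }
}

A sense-reversing design like this is what makes the barrier reusable: each release flips the phase, so a thread racing ahead into the next ggml_syncthreads call cannot be confused with a straggler from the previous one.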

13 changes: 5 additions & 8 deletions llama.cpp/ggml.h
@@ -666,25 +666,22 @@ extern "C" {
 
     // compute types
 
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_TYPE_INIT = 0,
-        GGML_TASK_TYPE_COMPUTE,
-        GGML_TASK_TYPE_FINALIZE,
-    };
+    struct ggml_barrier;
 
     struct ggml_compute_params {
-        enum ggml_task_type type;
 
         // ith = thread index, nth = number of threads
         int ith, nth;
 
         // work buffer for all threads
         size_t wsize;
         void * wdata;
+
+        struct ggml_barrier *barrier;
     };
 
+    GGML_API void ggml_syncthreads(struct ggml_barrier *);
+
     // numa strategies
     enum ggml_numa_strategy {
         GGML_NUMA_STRATEGY_DISABLED = 0,
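
The effect of the header change: an op is no longer scheduled up to three times with a task type; it runs once per thread and synchronizes itself through the barrier in ggml_compute_params. A hedged sketch of a kernel under the new API — hypothetical_op and its body are invented for illustration; only ggml_compute_params, its fields, and ggml_syncthreads come from the diff above:

#include <string.h>
#include "llama.cpp/ggml.h"

static void hypothetical_op(const struct ggml_compute_params *params, float *dst, long n) {
    if (params->ith == 0) {
        // what used to be the INIT pass: one thread zeroes the shared work buffer
        memset(params->wdata, 0, params->wsize);
    }
    ggml_syncthreads(params->barrier);  // nobody computes until init is done

    // what used to be the COMPUTE pass: each thread takes a slice of the rows
    for (long i = params->ith; i < n; i += params->nth)
        dst[i] = 0;  // placeholder work
}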
2 changes: 1 addition & 1 deletion llamafile/llamafile.h
@@ -49,7 +49,7 @@ int llamafile_gpu_parse(const char *);
 const char *llamafile_describe_gpu(void);
 
 bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int,
-                     int, int, int, int, int);
+                     int, int, int, int);
 
 struct ggml_tensor;
 struct ggml_compute_params;
6 changes: 2 additions & 4 deletions llamafile/sgemm.cpp
@@ -107,16 +107,14 @@ static const struct GemmFuncs {
  * @param ldc is row stride of `C`
  * @param ith is thread id (must be less than `nth`)
  * @param nth is number of threads (must be greater than zero)
- * @param task is GGML task type
  * @param Atype is GGML data type of `A`
  * @param Btype is GGML data type of `B`
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
 bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void *B, long ldb,
-                     void *C, long ldc, int ith, int nth, int task, int Atype, int Btype,
-                     int Ctype) {
-    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype);
+                     void *C, long ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
+    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, Atype, Btype, Ctype);
 }
 
 /**
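
With the task argument gone from llamafile_sgemm, callers pass only thread coordinates and data types, and kernels no longer need an early-out for non-COMPUTE passes. A hedged sketch of the new calling convention — multiply_f32 and its include paths are assumptions; the 14-argument signature itself matches the diff above:

#include <stdbool.h>
#include "llama.cpp/ggml.h"       // GGML_TYPE_F32
#include "llamafile/llamafile.h"  // llamafile_sgemm

// single-threaded f32 matmul: this caller is thread 0 of 1
static bool multiply_f32(long m, long n, long k, const float *A, long lda,
                         const float *B, long ldb, float *C, long ldc) {
    return llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
                           /*ith=*/0, /*nth=*/1,
                           GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
}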
18 changes: 9 additions & 9 deletions llamafile/sgemm.h
Original file line number Diff line number Diff line change
@@ -8,23 +8,23 @@ struct ggml_tensor;
 struct ggml_compute_params;
 
 bool llamafile_sgemm_unsupported(long, long, long, const void *, long, const void *, long, void *,
-                                 long, int, int, int, int, int, int);
+                                 long, int, int, int, int, int);
 bool llamafile_sgemm_amd_avx(long, long, long, const void *, long, const void *, long, void *, long,
-                             int, int, int, int, int, int);
+                             int, int, int, int, int);
 bool llamafile_sgemm_amd_fma(long, long, long, const void *, long, const void *, long, void *, long,
-                             int, int, int, int, int, int);
+                             int, int, int, int, int);
 bool llamafile_sgemm_amd_avx2(long, long, long, const void *, long, const void *, long, void *,
-                              long, int, int, int, int, int, int);
+                              long, int, int, int, int, int);
 bool llamafile_sgemm_amd_avxvnni(long, long, long, const void *, long, const void *, long, void *,
-                                 long, int, int, int, int, int, int);
+                                 long, int, int, int, int, int);
 bool llamafile_sgemm_amd_avx512f(long, long, long, const void *, long, const void *, long, void *,
-                                 long, int, int, int, int, int, int);
+                                 long, int, int, int, int, int);
 bool llamafile_sgemm_amd_zen4(long, long, long, const void *, long, const void *, long, void *,
-                              long, int, int, int, int, int, int);
+                              long, int, int, int, int, int);
 bool llamafile_sgemm_arm80(long, long, long, const void *, long, const void *, long, void *, long,
-                           int, int, int, int, int, int);
+                           int, int, int, int, int);
 bool llamafile_sgemm_arm82(long, long, long, const void *, long, const void *, long, void *, long,
-                           int, int, int, int, int, int);
+                           int, int, int, int, int);
 
 bool llamafile_mixmul_unsupported(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
4 changes: 2 additions & 2 deletions llamafile/sgemm_sss_test.cpp
@@ -46,8 +46,8 @@ int main(int argc, char *argv[]) {
     randomize(k, n, B, ldb);
 
     BENCH(gemm(true, false, m, n, k, 1., A, lda, B, ldb, 0., G, ldc));
-    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TASK_TYPE_COMPUTE,
-                          GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
+    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TYPE_F32, GGML_TYPE_F32,
+                          GGML_TYPE_F32));
 
     for (int i = 0; i < m; ++i)
         for (int j = 0; j < n; ++j) {
4 changes: 2 additions & 2 deletions llamafile/sgemm_vecdot_test.cpp
@@ -45,8 +45,8 @@ int main(int argc, char *argv[]) {
     randomize(k, n, B, ldb);
 
     BENCH(gemm(true, false, m, n, k, 1., A, lda, B, ldb, 0., G, ldc));
-    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TASK_TYPE_COMPUTE,
-                          GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
+    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TYPE_F32, GGML_TYPE_F32,
+                          GGML_TYPE_F32));
 
     double err_sum = 0;
     long long err_worst = 0;
