Rewrite synchronization primitives
So far this is improving token generation speed by 10%, and it also
appears to be unleashing even better prompt eval times.
jart committed Apr 28, 2024
1 parent 58d2ca0 · commit 6162004
Showing 8 changed files with 537 additions and 1,231 deletions.
1,585 changes: 443 additions & 1,142 deletions llama.cpp/ggml.c

Large diffs are not rendered by default.
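
The ggml.c diff, where the new barrier actually lives, is collapsed above. To give a sense of what a synchronization primitive like this involves, here is a minimal sketch of a reusable spin barrier built on C11 atomics. It is an illustration only: the field names, memory orderings, and spin strategy are assumptions, not the code from this commit.

#include <stdatomic.h>

// Illustrative only: one barrier shared by all nth worker threads.
struct ggml_barrier {
    atomic_uint phase;  // bumped once per release; spinners watch this
    atomic_int count;   // how many threads have arrived so far
    int nth;            // number of participating threads
};

void ggml_syncthreads(struct ggml_barrier *b) {
    unsigned phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
    if (atomic_fetch_add_explicit(&b->count, 1, memory_order_acq_rel) + 1 == b->nth) {
        // last arrival: reset the counter, then release the spinners
        atomic_store_explicit(&b->count, 0, memory_order_relaxed);
        atomic_store_explicit(&b->phase, phase + 1, memory_order_release);
    } else {
        // earlier arrivals: spin until the last thread bumps the phase
        while (atomic_load_explicit(&b->phase, memory_order_acquire) == phase)
            ;
    }
}

A sense-reversing design like this is what makes the barrier reusable: each release flips the phase, so a thread racing ahead into the next ggml_syncthreads call cannot be confused with a straggler from the previous one.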

13 changes: 5 additions & 8 deletions llama.cpp/ggml.h
@@ -666,25 +666,22 @@ extern "C" {
 
     // compute types
 
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_TYPE_INIT = 0,
-        GGML_TASK_TYPE_COMPUTE,
-        GGML_TASK_TYPE_FINALIZE,
-    };
+    struct ggml_barrier;
 
     struct ggml_compute_params {
-        enum ggml_task_type type;
 
         // ith = thread index, nth = number of threads
         int ith, nth;
 
         // work buffer for all threads
         size_t wsize;
         void * wdata;
+
+        struct ggml_barrier *barrier;
     };
 
+    GGML_API void ggml_syncthreads(struct ggml_barrier *);
+
     // numa strategies
     enum ggml_numa_strategy {
         GGML_NUMA_STRATEGY_DISABLED = 0,
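
The effect of the header change: an op is no longer scheduled up to three times with a task type; it runs once per thread and synchronizes itself through the barrier in ggml_compute_params. A hedged sketch of a kernel under the new API — hypothetical_op and its body are invented for illustration; only ggml_compute_params, its fields, and ggml_syncthreads come from the diff above:

#include <string.h>
#include "llama.cpp/ggml.h"

static void hypothetical_op(const struct ggml_compute_params *params, float *dst, long n) {
    if (params->ith == 0) {
        // what used to be the INIT pass: one thread zeroes the shared work buffer
        memset(params->wdata, 0, params->wsize);
    }
    ggml_syncthreads(params->barrier);  // nobody computes until init is done

    // what used to be the COMPUTE pass: each thread takes a slice of the rows
    for (long i = params->ith; i < n; i += params->nth)
        dst[i] = 0;  // placeholder work
}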
2 changes: 1 addition & 1 deletion llamafile/llamafile.h
@@ -49,7 +49,7 @@ int llamafile_gpu_parse(const char *);
 const char *llamafile_describe_gpu(void);
 
 bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int,
-                     int, int, int, int, int);
+                     int, int, int, int);
 
 struct ggml_tensor;
 struct ggml_compute_params;
6 changes: 2 additions & 4 deletions llamafile/sgemm.cpp
@@ -107,16 +107,14 @@ static const struct GemmFuncs {
  * @param ldc is row stride of `C`
  * @param ith is thread id (must be less than `nth`)
  * @param nth is number of threads (must be greater than zero)
- * @param task is GGML task type
  * @param Atype is GGML data type of `A`
  * @param Btype is GGML data type of `B`
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
 bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void *B, long ldb,
-                     void *C, long ldc, int ith, int nth, int task, int Atype, int Btype,
-                     int Ctype) {
-    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype);
+                     void *C, long ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
+    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, Atype, Btype, Ctype);
 }
 
 /**
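
With the task argument gone from llamafile_sgemm, callers pass only thread coordinates and data types, and kernels no longer need an early-out for non-COMPUTE passes. A hedged sketch of the new calling convention — multiply_f32 and its include paths are assumptions; the 14-argument signature itself matches the diff above:

#include <stdbool.h>
#include "llama.cpp/ggml.h"       // GGML_TYPE_F32
#include "llamafile/llamafile.h"  // llamafile_sgemm

// single-threaded f32 matmul: this caller is thread 0 of 1
static bool multiply_f32(long m, long n, long k, const float *A, long lda,
                         const float *B, long ldb, float *C, long ldc) {
    return llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
                           /*ith=*/0, /*nth=*/1,
                           GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
}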
18 changes: 9 additions & 9 deletions llamafile/sgemm.h
Original file line number Diff line number Diff line change
@@ -8,23 +8,23 @@ struct ggml_tensor;
 struct ggml_compute_params;
 
 bool llamafile_sgemm_unsupported(long, long, long, const void *, long, const void *, long, void *,
-                                 long, int, int, int, int, int, int);
+                                 long, int, int, int, int, int);
 bool llamafile_sgemm_amd_avx(long, long, long, const void *, long, const void *, long, void *, long,
-                             int, int, int, int, int, int);
+                             int, int, int, int, int);
 bool llamafile_sgemm_amd_fma(long, long, long, const void *, long, const void *, long, void *, long,
-                             int, int, int, int, int, int);
+                             int, int, int, int, int);
 bool llamafile_sgemm_amd_avx2(long, long, long, const void *, long, const void *, long, void *,
-                              long, int, int, int, int, int, int);
+                              long, int, int, int, int, int);
 bool llamafile_sgemm_amd_avxvnni(long, long, long, const void *, long, const void *, long, void *,
-                                 long, int, int, int, int, int, int);
+                                 long, int, int, int, int, int);
 bool llamafile_sgemm_amd_avx512f(long, long, long, const void *, long, const void *, long, void *,
-                                 long, int, int, int, int, int, int);
+                                 long, int, int, int, int, int);
 bool llamafile_sgemm_amd_zen4(long, long, long, const void *, long, const void *, long, void *,
-                              long, int, int, int, int, int, int);
+                              long, int, int, int, int, int);
 bool llamafile_sgemm_arm80(long, long, long, const void *, long, const void *, long, void *, long,
-                           int, int, int, int, int, int);
+                           int, int, int, int, int);
 bool llamafile_sgemm_arm82(long, long, long, const void *, long, const void *, long, void *, long,
-                           int, int, int, int, int, int);
+                           int, int, int, int, int);
 
 bool llamafile_mixmul_unsupported(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
4 changes: 2 additions & 2 deletions llamafile/sgemm_sss_test.cpp
@@ -46,8 +46,8 @@ int main(int argc, char *argv[]) {
     randomize(k, n, B, ldb);
 
     BENCH(gemm(true, false, m, n, k, 1., A, lda, B, ldb, 0., G, ldc));
-    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TASK_TYPE_COMPUTE,
-                          GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
+    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TYPE_F32, GGML_TYPE_F32,
+                          GGML_TYPE_F32));
 
     for (int i = 0; i < m; ++i)
         for (int j = 0; j < n; ++j) {
4 changes: 2 additions & 2 deletions llamafile/sgemm_vecdot_test.cpp
@@ -45,8 +45,8 @@ int main(int argc, char *argv[]) {
     randomize(k, n, B, ldb);
 
     BENCH(gemm(true, false, m, n, k, 1., A, lda, B, ldb, 0., G, ldc));
-    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TASK_TYPE_COMPUTE,
-                          GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32));
+    BENCH(llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1, GGML_TYPE_F32, GGML_TYPE_F32,
+                          GGML_TYPE_F32));
 
     double err_sum = 0;
     long long err_worst = 0;
