Skip to content

Commit

Permalink
Update ggml (#128)
Browse files Browse the repository at this point in the history
* Fix quantize.py doc

* Add Q5 format compatibility test

* Update ggml

* Add documentation about limitations of sequence mode

* Fix most compiler warnings

* Clean up CMakeLists.txt

* Assert contiguity instead of assuming it

* Update README.md

* Fix warnings

* Try to fix compilation error

* Attempt to fix Ubuntu build

* Attempt to fix Ubuntu build

* Restore all build jobs

* Allow sequence lengths of up to 64 out of the box by forking ggml
  • Loading branch information
saharNooby committed Sep 19, 2023
1 parent d6c691e commit 8db73b1
Show file tree
Hide file tree
Showing 19 changed files with 1,060 additions and 1,004 deletions.
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,4 +1,4 @@
[submodule "ggml"]
path = ggml
url = https://github.com/ggerganov/ggml
branch = master
url = https://github.com/saharNooby/ggml
branch = increased-node-limit-2023-09-19
75 changes: 66 additions & 9 deletions CMakeLists.txt
Expand Up @@ -106,6 +106,7 @@ if (RWKV_CUBLAS)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)

if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")

Expand All @@ -121,6 +122,17 @@ if (RWKV_CUBLAS)
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

# Architecture set-up copy-pasted from https://github.com/ggerganov/llama.cpp/blob/111163e2463171891680feed94371eb9becd9817/CMakeLists.txt#L317
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52: lowest CUDA 12 standard
# 60: f16 CUDA intrinsics
# 61: integer CUDA intrinsics
# 70: compute capability at which unrolling a loop in mul_mat_q kernels is faster

# Lowest CUDA 12 standard + lowest for integer intrinsics.
set(CMAKE_CUDA_ARCHITECTURES "52;61;70")
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "cuBLAS not found")
endif()
Expand All @@ -138,15 +150,15 @@ if (RWKV_CLBLAST)
$ENV{OPENCL_HOME}
$ENV{OPENCL_HOME}/include
${OPENCL_INCLUDE_SEARCH_PATHS}
)
)

set(CLBLAST_INCLUDE_SEARCH_PATHS
/usr/include
/usr/local/include
$ENV{CLBLAST_HOME}
$ENV{CLBLAST_HOME}/include
${CLBLAST_INCLUDE_SEARCH_PATHS}
)
)

find_path(OPENCL_INC NAMES opencl.h PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES include/CL)
find_library(OPENCL_LIB NAMES OpenCL PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} PATH_SUFFIXES lib)
Expand Down Expand Up @@ -286,6 +298,56 @@ else()
message(STATUS "Unknown architecture")
endif()

#
# POSIX conformance
# Section copy-pasted from https://github.com/ggerganov/llama.cpp/blob/8781013ef654270cbead3e0011e33a6d690fb168/CMakeLists.txt#L580C20-L580C20
#

# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)

# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher.
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()

# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc.
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
endif()

# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions.
# Similarly on DragonFly, enabling BSD extensions is necessary.
if (
CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
CMAKE_SYSTEM_NAME MATCHES "iOS" OR
CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
CMAKE_SYSTEM_NAME MATCHES "DragonFly"
)
add_compile_definitions(_DARWIN_C_SOURCE)
endif()

# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases.
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
add_compile_definitions(_BSD_SOURCE)
endif()

#
# Build libraries
#
Expand All @@ -296,7 +358,9 @@ endif()

add_library(ggml OBJECT
${CMAKE_SOURCE_DIR}/ggml/src/ggml.c
${CMAKE_SOURCE_DIR}/ggml/src/ggml-alloc.c
${CMAKE_SOURCE_DIR}/ggml/include/ggml/ggml.h
${CMAKE_SOURCE_DIR}/ggml/include/ggml/ggml-alloc.h
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES})

Expand Down Expand Up @@ -328,13 +392,6 @@ if (RWKV_BUILD_SHARED_LIBRARY)
target_compile_definitions(rwkv PRIVATE RWKV_SHARED RWKV_BUILD)
endif()

if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET rwkv PROPERTY CUDA_ARCHITECTURES OFF)
endif()

if (NOT RWKV_STANDALONE)
set_property(TARGET ggml PROPERTY GGML_STANDALONE OFF)
enable_testing()
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -6,7 +6,7 @@ Besides the usual **FP32**, it supports **FP16**, **quantized INT4, INT5 and INT

This project provides [a C library rwkv.h](rwkv.h) and [a convenient Python wrapper](rwkv%2Frwkv_cpp_model.py) for it.

RWKV is a novel large language model architecture, [with the largest model in the family having 14B parameters](https://huggingface.co/BlinkDL/rwkv-4-pile-14b). In contrast to Transformer with `O(n^2)` attention, RWKV requires only state from previous step to calculate logits. This makes RWKV very CPU-friendly on large context lengths.
[RWKV](https://arxiv.org/abs/2305.13048) is a novel large language model architecture, [with the largest model in the family having 14B parameters](https://huggingface.co/BlinkDL/rwkv-4-pile-14b). In contrast to Transformer with `O(n^2)` attention, RWKV requires only state from previous step to calculate logits. This makes RWKV very CPU-friendly on large context lengths.

Loading LoRA checkpoints in [Blealtan's format](https://github.com/Blealtan/RWKV-LM-LoRA) is supported through [merge_lora_into_ggml.py script](rwkv%2Fmerge_lora_into_ggml.py).

Expand Down
2 changes: 1 addition & 1 deletion ggml
Submodule ggml updated from a1d0ea to d925ed

0 comments on commit 8db73b1

Please sign in to comment.