Skip to content

Commit

Permalink
add hipBLAS for windows (#135)
Browse files Browse the repository at this point in the history
* add hipBLAS for windows

* fix test build

* do test and fix load dll on windows

* add hipBLAS_on_Windows document

* update hipBLAS_on_Windows doc

* add benchmark
  • Loading branch information
Cyberhan123 committed Oct 6, 2023
1 parent 0df970a commit 22a2778
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 12 deletions.
42 changes: 41 additions & 1 deletion CMakeLists.txt
Expand Up @@ -41,6 +41,7 @@ option(RWKV_ACCELERATE "rwkv: enable Accelerate framework"
option(RWKV_OPENBLAS "rwkv: use OpenBLAS" OFF)
option(RWKV_CUBLAS "rwkv: use cuBLAS" OFF)
option(RWKV_CLBLAST "rwkv: use CLBlast" OFF)
option(RWKV_HIPBLAS "rwkv: use hipBLAS" OFF)

# Build only shared library without building tests and extras
option(RWKV_STANDALONE "rwkv: build only RWKV library" OFF)
Expand Down Expand Up @@ -184,6 +185,41 @@ if (RWKV_CLBLAST)
endif()
endif()

if (RWKV_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()

find_package(hip)
find_package(hipblas)
find_package(rocblas)

if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT
${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.cu
${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.h )
if (BUILD_SHARED_LIBS)
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(ggml-rocm PUBLIC ${CMAKE_SOURCE_DIR}/ggml/include/ggml)
set_source_files_properties(${CMAKE_SOURCE_DIR}/ggml/src/ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
if (RWKV_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(RWKV_EXTRA_LIBS ${RWKV_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()

if (RWKV_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
Expand Down Expand Up @@ -370,7 +406,11 @@ target_compile_features(ggml PUBLIC c_std_11) # Don't bump
if (MSVC)
target_link_libraries(ggml PUBLIC ${RWKV_EXTRA_LIBS} Threads::Threads)
else()
target_link_libraries(ggml PUBLIC m ${RWKV_EXTRA_LIBS} Threads::Threads)
if(WIN32 AND RWKV_HIPBLAS)
target_link_libraries(ggml PUBLIC ${RWKV_EXTRA_LIBS} Threads::Threads)
else()
target_link_libraries(ggml PUBLIC m ${RWKV_EXTRA_LIBS} Threads::Threads)
endif()
endif()

if (RWKV_BUILD_SHARED_LIBRARY)
Expand Down
16 changes: 16 additions & 0 deletions README.md
Expand Up @@ -46,6 +46,18 @@ Measurements were made on Intel i7 13700K & NVIDIA 3060 Ti 8 GB. Latency per tok

Note: since cuBLAS is supported only for `ggml_mul_mat()`, we still need to use few CPU resources to execute remaining operations.

### With hipBLAS
Measurements were made on CPU AMD Ryzen 9 5900X & GPU AMD Radeon RX 7900 XTX. Latency per token in ms shown.

| Model | Layers on GPU | Format | 1 thread | 2 threads | 4 threads | 8 threads | 24 threads |
|------------------------------------------|---------------|--------|----------|-----------|-----------|-----------|------------|
| `RWKV-novel-4-World-7B-20230810-ctx128k` | 32 | `f16` | 94 | 91 | 94 | 106 | 944 |
| `RWKV-novel-4-World-7B-20230810-ctx128k` | 32 | `Q4_0` | 83 | 77 | 75 | 110 | 1692 |
| `RWKV-novel-4-World-7B-20230810-ctx128k` | 32 | `Q4_1` | 85 | 80 | 85 | 93 | 1691 |
| `RWKV-novel-4-World-7B-20230810-ctx128k` | 32 | `Q5_1` | 83 | 78 | 83 | 90 | 1115 |

Note: hipBLAS is same as cuBLAS.They only support `ggml_mul_mat()`, we still need to use few CPU resources to execute remaining operations.

## How to use

### 1. Clone the repo
Expand Down Expand Up @@ -86,6 +98,10 @@ If everything went OK, `bin\Release\rwkv.dll` file should appear.

Refer to [docs/cuBLAS_on_Windows.md](docs%2FcuBLAS_on_Windows.md) for a comprehensive guide.

##### Windows + hipBLAS

Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.

##### Linux / MacOS

**Requirements**: CMake (Linux: `sudo apt install cmake`, MacOS: `brew install cmake`, anaconoda: [cmake package](https://anaconda.org/conda-forge/cmake)).
Expand Down
99 changes: 99 additions & 0 deletions docs/hipBLAS_on_Windows.md
@@ -0,0 +1,99 @@
# Using cuBLAS on Windows

To get hipBLAS in `rwkv.cpp` working on Windows, go through this guide section by section.

## Build Tools for Visual Studio 2022

Skip this step if you already have Build Tools installed.

To install Build Tools, go to [Visual Studio Older Downloads](https://visualstudio.microsoft.com/vs/), download `Visual Studio 2022 and other Products` and run the installer.


## CMake

Skip this step if you already have CMake installed: running `cmake --version` should output `cmake version x.y.z`.

Download latest `Windows x64 Installer` from [Download | CMake](https://cmake.org/download/) and run it.

## ROCM

Skip this step if you already have Build Tools installed.

The [validation tools](https://rocm.docs.amd.com/en/latest/reference/validation_tools.html) not support on Windows. So you should confirm the Version of `ROCM` by yourself.

Fortunately `AMD` provides complete help documentation, you can use the help documentation to install [ROCM](https://rocm.docs.amd.com/en/latest/deploy/windows/quick_start.html)

>**If you encounter an error, if it is [AMD ROCm Windows Installation Error 215](https://github.com/RadeonOpenCompute/ROCm/issues/2363), don't worry about this error. ROCM has been installed correctly, but the vs studio plugin installation failed, we can ignore it.**
Then we must set `ROCM` as environment variables before running cmake.

Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin`

This is what I use to set the clang:
```Commandline
set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe
set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe
```

## Ninja

Skip this step if you already have Ninja installed: running `ninja --version` should output `1.11.1`.

Download latest `ninja-win.zip` from [GitHub Releases Page](https://github.com/ninja-build/ninja/releases/tag/v1.11.1) and unzip.Then set as environment variables.
I unzipped it in `C:\Program Files\ninja`, so I set it like this:

```Commandline
set ninja=C:\Program Files\ninja\ninja.exe
```
## Building rwkv.cpp

The thing different from the regular CPU build is `-DRWKV_HIPBLAS=ON` ,
`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100`

>**Notice** check the `clang` and `clang++` information:
```Commandline
clang --version
clang++ --version
```

If you see like this, we can continue:
```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```

```
clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be)
Target: x86_64-pc-windows-msvc
Thread model: posix
InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin
```

>**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus)
My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`.

option:

```commandline
mkdir build
cd build
cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DRWKV_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100
cmake --build . --config Release
```

If everything went OK, `build\bin\Release\rwkv.dll` file should appear.

## Using the GPU

You need to choose layer count that will be offloaded onto the GPU. In general, the more layers offloaded, the better will be the performance; but you may be constrained by VRAM size of your GPU. Increase offloaded layer count until you get "CUDA out of memory" errors.

If most of the computation is performed on GPU, you will not need high thread count. Optimal value may be as low as 1, since any additional threads would just eat CPU cycles while waiting for GPU operation to complete.

To offload layers to GPU:

- if using Python model: pass non-zero number in `gpu_layer_count` to constructor of `rwkv.rwkv_cpp_model.RWKVModel`
- if using Python wrapper for C library: call `rwkv.rwkv_cpp_shared_library.RWKVSharedLibrary.rwkv_gpu_offload_layers`
- if using C library directly: call `bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers)`
19 changes: 14 additions & 5 deletions extras/CMakeLists.txt
@@ -1,11 +1,20 @@
function(rwkv_add_extra source)
get_filename_component(EXTRA_TARGET ${source} NAME_WE)
add_executable(rwkv_${EXTRA_TARGET} ${source})
target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml rwkv)
if (RWKV_STATIC)
get_target_property(target_LINK_OPTIONS rwkv_${EXTRA_TARGET} LINK_OPTIONS)
list(REMOVE_ITEM target_LINK_OPTIONS "-static")
set_target_properties(rwkv_${EXTRA_TARGET} PROPERTIES LINK_OPTIONS "${target_LINK_OPTIONS}")
if(RWKV_HIPBLAS)
target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml-rocm ggml rwkv)
else()
target_link_libraries(rwkv_${EXTRA_TARGET} PRIVATE ggml rwkv)
endif()

if (RWKV_STATIC)
if(RWKV_HIPBLAS)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
else()
get_target_property(target_LINK_OPTIONS ${TEST_TARGET} LINK_OPTIONS)
list(REMOVE_ITEM target_LINK_OPTIONS "-static")
set_target_properties(${TEST_TARGET} PROPERTIES LINK_OPTIONS "${target_LINK_OPTIONS}")
endif()
endif()
endfunction()

Expand Down
11 changes: 9 additions & 2 deletions python/rwkv_cpp/rwkv_cpp_shared_library.py
Expand Up @@ -2,6 +2,7 @@
import sys
import ctypes
import pathlib
import platform
from typing import Optional, List, Tuple, Callable

QUANTIZED_FORMAT_NAMES: Tuple[str, str, str, str, str] = (
Expand Down Expand Up @@ -35,8 +36,13 @@ def __init__(self, shared_library_path: str) -> None:
shared_library_path : str
Path to rwkv.cpp shared library. On Windows, it would look like 'rwkv.dll'. On UNIX, 'rwkv.so'.
"""

self.library = ctypes.cdll.LoadLibrary(shared_library_path)
# When Python is greater than 3.8, we need to reprocess the custom dll
# according to the documentation to prevent loading failure errors.
# https://docs.python.org/3/whatsnew/3.8.html#ctypes
if platform.system().lower() == 'windows':
self.library = ctypes.CDLL(shared_library_path, winmode=0)
else:
self.library = ctypes.cdll.LoadLibrary(shared_library_path)

self.library.rwkv_init_from_file.argtypes = [ctypes.c_char_p, ctypes.c_uint32]
self.library.rwkv_init_from_file.restype = ctypes.c_void_p
Expand Down Expand Up @@ -406,6 +412,7 @@ def load_rwkv_shared_library() -> RWKVSharedLibrary:
lambda p: p / 'bin' / file_name,
# Some people prefer to build in the "build" subdirectory.
lambda p: p / 'build' / 'bin' / 'Release' / file_name,
lambda p: p / 'build' / 'bin' / file_name,
lambda p: p / 'build' / file_name,
# Fallback.
lambda p: p / file_name
Expand Down
16 changes: 12 additions & 4 deletions tests/CMakeLists.txt
Expand Up @@ -4,12 +4,20 @@ function(rwkv_add_test source)
if (GGML_CUDA_SOURCES)
set_property(TARGET ${TEST_TARGET} PROPERTY CUDA_ARCHITECTURES OFF)
endif()
target_link_libraries(${TEST_TARGET} PRIVATE ggml rwkv)
if(RWKV_HIPBLAS)
target_link_libraries(${TEST_TARGET} PRIVATE ggml-rocm ggml rwkv)
else()
target_link_libraries(${TEST_TARGET} PRIVATE ggml rwkv)
endif()
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
if (RWKV_STATIC)
get_target_property(target_LINK_OPTIONS ${TEST_TARGET} LINK_OPTIONS)
list(REMOVE_ITEM target_LINK_OPTIONS "-static")
set_target_properties(${TEST_TARGET} PROPERTIES LINK_OPTIONS "${target_LINK_OPTIONS}")
if(RWKV_HIPBLAS)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
else()
get_target_property(target_LINK_OPTIONS ${TEST_TARGET} LINK_OPTIONS)
list(REMOVE_ITEM target_LINK_OPTIONS "-static")
set_target_properties(${TEST_TARGET} PROPERTIES LINK_OPTIONS "${target_LINK_OPTIONS}")
endif()
endif()
endfunction()

Expand Down

0 comments on commit 22a2778

Please sign in to comment.