源码编译llama.cpp 、ggml 后端启用自定义BLAS加速
我在 llama.cpp 的 GitHub 仓库上提交了我的解决方案:How to setup OpenBlas on windows? #625
GGML 的 GitHub 仓库中的相关 issue: https://github.com/ggerganov/ggml/issues/959
windows on arm 编译 llama.cpp 、ggml 后端启用自定义BLAS加速
我这里以编译 windows on arm 的 llama.cpp 、ggml 为例,其它平台的做法相同
参考我的文章 《源码编译 openblas for windows on arm》 我用了自己编译的这个 openblas
原理: BLAS 加速是在编译 ggml 时提供的,
所以需要修改 llama.cpp/ggml/src/CMakeLists.txt,
在 if (GGML_BLAS) 这一行代码的
前面添加以下代码:
自定义编译blas:
# Custom BLAS: build ggml's BLAS backend against a user-built BLAS library
# (e.g. OpenBLAS) located under a fixed install prefix.
#
# Usage:
#   cmake -B build -DGGML_BLAS=OFF -DCUSTOM_BLAS=ON [-DBLAS_ROOT=<install prefix>]
option(CUSTOM_BLAS "ggml: use a user-supplied BLAS library from BLAS_ROOT" OFF)

if (CUSTOM_BLAS)
    # Install prefix of the BLAS build. Declared as a cache entry so it can be
    # overridden with -DBLAS_ROOT=... instead of editing this file; the default
    # keeps the previously hard-coded location.
    set(BLAS_ROOT "C:/workspace/program/openblas" CACHE PATH
        "Install prefix of the custom BLAS library")

    set(BLAS_INCLUDE_DIRS
        "${BLAS_ROOT}/include/"
        "${BLAS_ROOT}/include/openblas"
    )
    set(BLAS_LIBRARIES "${BLAS_ROOT}/lib/openblas.lib")

    # Fail at configure time with a clear message rather than at link time.
    if (NOT EXISTS "${BLAS_LIBRARIES}")
        message(FATAL_ERROR
            "CUSTOM_BLAS: '${BLAS_LIBRARIES}' not found. "
            "Pass -DBLAS_ROOT=<BLAS install prefix>.")
    endif()

    # Enable the BLAS backend sources and wire in the library and headers.
    # NOTE(review): the GGML_* variables are presumably consumed further down in
    # ggml/src/CMakeLists.txt when the ggml target is defined - confirm against
    # the ggml version in use.
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_BLAS)
    set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
    set(GGML_SOURCES_BLAS ggml-blas.cpp)
    list(APPEND GGML_EXTRA_LIBS     ${BLAS_LIBRARIES})
    list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS})
endif()
然后编译时通过 -DCUSTOM_BLAS=ON 启用自定义 BLAS (同时用 -DGGML_BLAS=OFF 关闭默认的 BLAS 查找)
:
cmake -S . -B build -DCUSTOM_BLAS=ON -DGGML_BLAS=OFF
cmake --build build --config Release
测试
llama.cpp/wmx_test/test_cli.sh :
bash
#!/bin/bash
# Run llama-cli once with a fixed prompt against a local GGUF model.
# The binary path is relative, so run this script from llama.cpp/wmx_test.
#
# Variant kept from the original (uses --hf-repo/--hf-file instead of a local path):
# ./llama-cli --hf-repo hfxing/Qwen2-1.5B-Q4_K_M-GGUF --hf-file qwen2-1.5b-q4_k_m.gguf -p "The meaning to life and the universe is"

cmd=../build/bin/llama-cli

modelpath=/media/wmx/soft1/huggingface_cache/Qwen2-1.5B-Q4_K_M-GGUF/qwen2-1.5b-q4_k_m.gguf
# modelpath=/media/wmx/soft1/huggingface_cache/Qwen1.5-1.8B-Chat/ggml-model-f16.gguf

user_prompt="你是一个AI助手。请问:深圳在哪里?"

# Every expansion is quoted so that a path containing spaces or glob
# characters is still passed as a single argument.
"$cmd" \
    -m "$modelpath" \
    -p "$user_prompt"
llama.cpp/wmx_test/test_llava_cli.sh :
bash
#!/bin/bash
# Run llama-llava-cli on one example image with a fixed chat-style prompt.
# The binary path is relative, so run this script from llama.cpp/wmx_test.

cmd=../build/bin/llama-llava-cli
modelpath=/media/wmx/soft1/huggingface_cache/Bunny-v1_0-4B-gguf

# Backslash-newline inside double quotes joins the lines, so this is a single
# line of text; the literal "\n" is left for the -e flag passed below.
user_prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. \
USER: <image>\n Why is the image funny? \
ASSISTANT:"

# img_url="https://pic35.photophoto.cn/20150511/0034034892281415_b.jpg"

# A "#" comment line must not appear inside a backslash-continued command: the
# comment ends the command, and the following "-p ..." line would then run as a
# separate command. The previously commented-out option is therefore kept here:
#   --log-disable
# To use it, add it as its own "--log-disable \" line before -p.
"$cmd" -m "$modelpath/ggml-model-Q4_K_M.gguf" \
    --mmproj "$modelpath/mmproj-model-f16.gguf" \
    --image "$modelpath/images/example_2.png" \
    -c 4096 -e \
    --temp 0.0 \
    -p "$user_prompt"
llama.cpp/wmx_test/test_server.sh :
bash
#!/bin/bash
# Start llama-server on a fixed port with a local GGUF model.
# The binary path is relative, so run this script from llama.cpp/wmx_test.

# netstat -lnp |grep 8000
# default port=8080 is used by ollama
PORT=8000

cmd=../build/bin/llama-server

modelpath=/media/wmx/soft1/huggingface_cache/Qwen2-1.5B-Q4_K_M-GGUF/qwen2-1.5b-q4_k_m.gguf
# modelpath=/media/wmx/soft1/huggingface_cache/Qwen1.5-1.8B-Chat/ggml-model-f16.gguf

# Every expansion is quoted so that a path containing spaces or glob
# characters is still passed as a single argument.
"$cmd" \
    -m "$modelpath" \
    --port "$PORT"
llama.cpp/wmx_test/test_client.sh (注意: 该文件内容是 Python 脚本, 需要用 python 运行) :
python
# Minimal chat-completion client for a server listening on localhost:PORT
# with an OpenAI-style /v1 API.
import openai

# Must match the port the server was started with.
PORT = 8000

# The API key has to be passed to the client itself: an explicitly constructed
# openai.OpenAI() does not read the module-level `openai.api_key` and raises
# when no key is supplied and OPENAI_API_KEY is unset.
client = openai.OpenAI(
    base_url=f"http://localhost:{PORT}/v1",
    api_key="sk-no-key-required",
)

# NOTE(review): the model name is presumably a placeholder when talking to a
# local llama-server - confirm against the server's documentation.
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
        {"role": "user", "content": "深圳在哪里?"},
    ],
)

print(completion.choices[0].message)