llama.cpp

Infer - Homebrew

Recommended for end users.

brew install llama.cpp

REPO=Qwen/Qwen2-1.5B-Instruct-GGUF
FILE=qwen2-1_5b-instruct-q4_k_m.gguf
PROMPT="Write helloworld code in Rust"
llama-cli --hf-repo ${REPO} --hf-file ${FILE} -p "${PROMPT}" -i --n-gpu-layers 10
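The same Homebrew formula also installs llama-server. A minimal sketch of serving the same model over llama.cpp's OpenAI-compatible HTTP API, reusing the ${REPO} and ${FILE} variables above (8080 is the default port):

llama-server --hf-repo ${REPO} --hf-file ${FILE} --port 8080

# in another shell: query the OpenAI-compatible endpoint
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages":[{"role":"user","content":"Write helloworld code in Rust"}]}'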

Build

Recommended for development.

git clone https://github.com/ggml-org/llama.cpp.git
cd llama.cpp

# not working: builds, but without GPU support
make GGML_CUDA=1

# not working: fails with `cicc: not found`
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DLLAMA_CUBLAS=ON

# working after `rm -rf build`
cmake -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DGGML_CUDA=ON
cmake --build build --config Release
# copy binaries to the current dir
cp ./build/bin/* .
# but still slow

# rebuild the Python bindings (llama-cpp-python) against CUDA as well
CUDACXX=/usr/local/cuda-12.4/bin/nvcc CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all-major" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir --force-reinstall --upgrade

# try again
cmake -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all-major
cmake --build build --config Release
# copy binaries to the current dir
cp ./build/bin/* .
# but still slow...🤔
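If it still feels slow even with a CUDA build, two quick sanity checks are worth running before another rebuild: that the toolchain cmake picks up is the one intended, and that the GPU is visible and actually busy during inference. A sketch, using the paths from the log above:

# confirm the nvcc that -DCMAKE_CUDA_COMPILER points at exists and reports 12.4
/usr/local/cuda-12.4/bin/nvcc --version

# confirm the driver sees the GPU; re-run while inferring to watch utilization
nvidia-smi

Note also that the inference commands in this document pass --n-gpu-layers 10, so most layers stay on the CPU regardless of how the binary was built; see the offload comparison further below.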

Prepare model

curl -LO https://huggingface.co/second-state/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf
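An alternative if the direct curl download is flaky: the huggingface-cli tool (from the huggingface_hub PyPI package) can fetch the same file and resume interrupted downloads. A sketch, assuming a working pip:

pip install -U huggingface_hub
huggingface-cli download second-state/gemma-2-9b-it-GGUF gemma-2-9b-it-Q4_K_M.gguf --local-dir .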

Infer

MODEL=gemma-2-9b-it-Q4_K_M.gguf
PROMPT="Write helloworld code in Rust"
./llama-cli -m ${MODEL} -p "${PROMPT}" -i --n-gpu-layers 10
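--n-gpu-layers 10 offloads only 10 layers, and gemma-2-9b has 42, so most of the compute stays on the CPU; this alone would explain the "still slow" observation above. A sketch of full offload, plus a throughput comparison using the llama-bench tool built alongside llama-cli:

# offload everything that fits (llama.cpp clamps the value to the model's layer count)
./llama-cli -m ${MODEL} -p "${PROMPT}" -i --n-gpu-layers 99

# compare tokens/sec at several offload levels
./llama-bench -m ${MODEL} -ngl 0,10,99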

Infer - Docker
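The local/llama.cpp:full-cuda tag below is not on any registry; it has to be built from the checkout first. A sketch following the llama.cpp Docker docs, assuming the .devops/full-cuda.Dockerfile path present in the repo at the time of this log:

# build the CUDA-enabled image from inside the llama.cpp checkout
# (running with --gpus requires the NVIDIA Container Toolkit on the host)
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .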

MODEL=gemma-2-9b-it-Q4_K_M.gguf
docker run --gpus all -v ./:/models local/llama.cpp:full-cuda --run -m /models/${MODEL} -p "Write helloworld code in Rust" -n 512 --n-gpu-layers 1