Moving to pure llama.cpp server
* configuration is now almost entirely within the .env file
* server is built as part of the container image
* llama.cpp source is selected by tag
* parallel execution enabled
* removed separate llama_cpp_server config

Signed-off-by: Jiri Podivin <[email protected]>
jpodivin committed Jan 17, 2025
1 parent 2b95f30 commit 6b9af1c
Showing 5 changed files with 26 additions and 27 deletions.
12 changes: 9 additions & 3 deletions .env
@@ -3,8 +3,14 @@
 LLAMA_CPP_SERVER_PORT=8000
 LLAMA_CPP_HOST=llama-cpp-server
 LOGDETECTIVE_SERVER_PORT=8080
-MODEL_FILEPATH=/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf
 # for some reason, fastapi cripples sys.path and some deps cannot be found
 PYTHONPATH=/src:/usr/local/lib64/python3.12/site-packages:/usr/lib64/python312.zip:/usr/lib64/python3.12/:/usr/lib64/python3.12/lib-dynload:/usr/local/lib/python3.12/site-packages:/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages
-LLM_NGPUS=-1
-LLAMA_CPP_CONFIG=llama_cpp_server_config.json
+LLAMA_ARG_MODEL="/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf"
+LLAMA_ARG_ALIAS="default-model"
+LLAMA_ARG_N_GPU_LAYERS=-1
+LLAMA_ARG_THREADS=12
+LLAMA_ARG_BATCH=512
+# Modify following var when switching model
+LLAMA_ARG_CHAT_TEMPLATE="mistral-v3"
+LLAMA_ARG_CTX_SIZE=32768
+LLAMA_ARG_N_PARALLEL=4
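
A note on the new variables: llama-server reads LLAMA_ARG_* environment variables as equivalents of its command-line flags (e.g. LLAMA_ARG_CTX_SIZE maps to --ctx-size, LLAMA_ARG_N_PARALLEL to --parallel), which is why the compose commands below no longer pass --model or --n_gpu_layers explicitly. A minimal sketch of exercising the same configuration outside the container, assuming a llama-server binary on PATH and the model file present at the path above:

    # export everything from .env so llama-server picks the values up
    set -a; source .env; set +a
    # host and port are still given as flags, matching the compose command
    llama-server --host 0.0.0.0 --port "${LLAMA_CPP_SERVER_PORT}"
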
20 changes: 15 additions & 5 deletions Containerfile.cuda
@@ -8,12 +8,22 @@ RUN dnf install -y python3-requests python3-pip gcc gcc-c++ python3-scikit-build
 && echo "gpgcheck=1" >> /etc/yum.repos.d/cuda.repo \
 && echo "gpgkey=https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/D42D0685.pub" >> /etc/yum.repos.d/cuda.repo \
 && dnf module enable -y nvidia-driver:555-dkms \
-&& dnf install -y cuda-compiler-12-5 cuda-toolkit-12-5 nvidia-driver-cuda-libs \
+&& dnf install -y cuda-compiler-12-5 cuda-toolkit-12-5 nvidia-driver-cuda-libs cmake \
 && dnf clean all
 ENV CMAKE_ARGS="-DGGML_CUDA=on"
+ENV LLAMACPP_VER="b4501"
 ENV PATH=${PATH}:/usr/local/cuda-12.5/bin/
 # some of these are either not in F39 or have old version
-RUN pip3 install llama_cpp_python==0.2.85 starlette drain3 sse-starlette starlette-context \
-    pydantic-settings fastapi[standard] \
-    && mkdir /src
-COPY ./logdetective/ /src/logdetective/logdetective
+# RUN pip3 install llama_cpp_python==0.2.85 starlette drain3 sse-starlette starlette-context \
+#     pydantic-settings fastapi[standard] \
+#     && mkdir /src
+
+# Clone, checkout, build and move llama.cpp server to path
+RUN git clone https://github.com/ggerganov/llama.cpp.git && \
+    cd llama.cpp && \
+    git checkout $LLAMACPP_VER && \
+    cmake -B build && \
+    cmake --build build --config Release -j 4 -t llama-server && \
+    mv ./build/bin/llama-server /bin/llama-server
+
+# COPY ./logdetective/ /src/logdetective/logdetective
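
With the server binary now compiled into the image instead of installed via llama_cpp_python, a quick smoke test against the running container confirms the pinned build works; a rough sketch, assuming the port and alias values from .env:

    # liveness probe served by llama-server
    curl -s http://localhost:8000/health
    # OpenAI-compatible chat endpoint; "default-model" is the LLAMA_ARG_ALIAS value
    curl -s http://localhost:8000/v1/chat/completions \
        -H 'Content-Type: application/json' \
        -d '{"model": "default-model", "messages": [{"role": "user", "content": "Say hello"}]}'
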
2 changes: 1 addition & 1 deletion docker-compose-prod.yaml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --n_ctx 32768"
+    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
     stdin_open: true
     tty: true
     env_file: .env
3 changes: 1 addition & 2 deletions docker-compose.yaml
@@ -6,15 +6,14 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --config /${LLAMA_CPP_CONFIG}"
+    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
     stdin_open: true
     tty: true
     env_file: .env
     ports:
       - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
     volumes:
       - ${MODELS_PATH-./models}:/models:Z
-      - ./${LLAMA_CPP_CONFIG}:/${LLAMA_CPP_CONFIG}:Z
     # these 4 lines are needed for CUDA acceleration
     # devices:
     #   - nvidia.com/gpu=all
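
Because the llama.cpp build now happens in the Containerfile, picking up this change requires rebuilding the image, not just restarting the service; a sketch of the usual loop (docker compose shown, podman-compose follows the same commands):

    # rebuild so the pinned llama.cpp tag (b4501) is compiled into the image
    docker compose up --build -d
    # follow the logs to see llama-server load the model and context size from .env
    docker compose logs -f
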
16 changes: 0 additions & 16 deletions llama_cpp_server_config.json

This file was deleted.
