diff --git a/.env b/.env
index eefcb6c..b55bd76 100644
--- a/.env
+++ b/.env
@@ -3,8 +3,14 @@
 LLAMA_CPP_SERVER_PORT=8000
 LLAMA_CPP_HOST=llama-cpp-server
 LOGDETECTIVE_SERVER_PORT=8080
-MODEL_FILEPATH=/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf
 # for some reason, fastapi cripples sys.path and some deps cannot be found
 PYTHONPATH=/src:/usr/local/lib64/python3.12/site-packages:/usr/lib64/python312.zip:/usr/lib64/python3.12/:/usr/lib64/python3.12/lib-dynload:/usr/local/lib/python3.12/site-packages:/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages
-LLM_NGPUS=-1
-LLAMA_CPP_CONFIG=llama_cpp_server_config.json
+LLAMA_ARG_MODEL="/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf"
+LLAMA_ARG_ALIAS="default-model"
+LLAMA_ARG_N_GPU_LAYERS=-1
+LLAMA_ARG_THREADS=12
+LLAMA_ARG_BATCH=512
+# Modify following var when switching model
+LLAMA_ARG_CHAT_TEMPLATE="mistral-v3"
+LLAMA_ARG_CTX_SIZE=32768
+LLAMA_ARG_N_PARALLEL=4
diff --git a/Containerfile.cuda b/Containerfile.cuda
index 5bd69a7..440d672 100644
--- a/Containerfile.cuda
+++ b/Containerfile.cuda
@@ -8,12 +8,22 @@ RUN dnf install -y python3-requests python3-pip gcc gcc-c++ python3-scikit-build
     && echo "gpgcheck=1" >> /etc/yum.repos.d/cuda.repo \
     && echo "gpgkey=https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/D42D0685.pub" >> /etc/yum.repos.d/cuda.repo \
     && dnf module enable -y nvidia-driver:555-dkms \
-    && dnf install -y cuda-compiler-12-5 cuda-toolkit-12-5 nvidia-driver-cuda-libs \
+    && dnf install -y cuda-compiler-12-5 cuda-toolkit-12-5 nvidia-driver-cuda-libs cmake \
     && dnf clean all
 ENV CMAKE_ARGS="-DGGML_CUDA=on"
+ENV LLAMACPP_VER="b4501"
 ENV PATH=${PATH}:/usr/local/cuda-12.5/bin/
 # some of these are either not in F39 or have old version
-RUN pip3 install llama_cpp_python==0.2.85 starlette drain3 sse-starlette starlette-context \
-    pydantic-settings fastapi[standard] \
-    && mkdir /src
-COPY ./logdetective/ /src/logdetective/logdetective
+# RUN pip3 install llama_cpp_python==0.2.85 starlette drain3 sse-starlette starlette-context \
+#     pydantic-settings fastapi[standard] \
+#     && mkdir /src
+
+# Clone, checkout, build and move llama.cpp server to path
+RUN git clone https://github.com/ggerganov/llama.cpp.git && \
+    cd llama.cpp && \
+    git checkout $LLAMACPP_VER && \
+    cmake -B build && \
+    cmake --build build --config Release -j 4 -t llama-server && \
+    mv ./build/bin/llama-server /bin/llama-server
+
+# COPY ./logdetective/ /src/logdetective/logdetective
diff --git a/docker-compose-prod.yaml b/docker-compose-prod.yaml
index 18d70e3..cf0ba0c 100644
--- a/docker-compose-prod.yaml
+++ b/docker-compose-prod.yaml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --n_ctx 32768"
+    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
     stdin_open: true
     tty: true
     env_file: .env
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 90ef6bc..e9c06a7 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --config /${LLAMA_CPP_CONFIG}"
+    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
     stdin_open: true
     tty: true
     env_file: .env
@@ -14,7 +14,6 @@ services:
       - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
     volumes:
       - ${MODELS_PATH-./models}:/models:Z
-      - ./${LLAMA_CPP_CONFIG}:/${LLAMA_CPP_CONFIG}:Z
     # these 4 lines are needed for CUDA acceleration
     # devices:
     #   - nvidia.com/gpu=all
diff --git a/llama_cpp_server_config.json b/llama_cpp_server_config.json
deleted file mode 100644
index 695de0d..0000000
--- a/llama_cpp_server_config.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "host": "0.0.0.0",
-    "port": 8000,
-    "models": [
-        {
-            "model": "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf",
-            "model_alias": "default-model",
-            "chat_format": "mistral-instruct",
-            "n_gpu_layers": -1,
-            "offload_kqv": true,
-            "n_threads": 12,
-            "n_batch": 512,
-            "n_ctx": 32768
-        }
-    ]
-}
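
As a quick smoke test of the switched-over stack, the snippet below is a minimal sketch and not part of the patch: llama-server exposes an OpenAI-compatible /v1/chat/completions endpoint, and the model is addressed by the alias set through LLAMA_ARG_ALIAS. The localhost:8000 address assumes the LLAMA_CPP_SERVER_PORT=8000 default from .env, and the prompt text is only a placeholder.

import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",  # assumes LLAMA_CPP_SERVER_PORT=8000
    json={
        "model": "default-model",  # alias from LLAMA_ARG_ALIAS
        "messages": [
            # placeholder prompt for the smoke test
            {"role": "user", "content": "Explain this build log snippet: ..."},
        ],
    },
    timeout=120,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])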