diff --git a/.env b/.env
index eefcb6c..b55bd76 100644
--- a/.env
+++ b/.env
@@ -3,8 +3,14 @@
 LLAMA_CPP_SERVER_PORT=8000
 LLAMA_CPP_HOST=llama-cpp-server
 LOGDETECTIVE_SERVER_PORT=8080
-MODEL_FILEPATH=/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf
 # for some reason, fastapi cripples sys.path and some deps cannot be found
 PYTHONPATH=/src:/usr/local/lib64/python3.12/site-packages:/usr/lib64/python312.zip:/usr/lib64/python3.12/:/usr/lib64/python3.12/lib-dynload:/usr/local/lib/python3.12/site-packages:/usr/lib64/python3.12/site-packages:/usr/lib/python3.12/site-packages
-LLM_NGPUS=-1
-LLAMA_CPP_CONFIG=llama_cpp_server_config.json
+LLAMA_ARG_MODEL="/models/mistral-7b-instruct-v0.2.Q4_K_S.gguf"
+LLAMA_ARG_ALIAS="default-model"
+LLAMA_ARG_N_GPU_LAYERS=-1
+LLAMA_ARG_THREADS=12
+LLAMA_ARG_BATCH=512
+# Modify following var when switching model
+LLAMA_ARG_CHAT_TEMPLATE="mistral-v3"
+LLAMA_ARG_CTX_SIZE=32768
+LLAMA_ARG_N_PARALLEL=4
diff --git a/Containerfile.cuda b/Containerfile.cuda
index 5bd69a7..440d672 100644
--- a/Containerfile.cuda
+++ b/Containerfile.cuda
@@ -8,12 +8,22 @@ RUN dnf install -y python3-requests python3-pip gcc gcc-c++ python3-scikit-build
     && echo "gpgcheck=1" >> /etc/yum.repos.d/cuda.repo \
     && echo "gpgkey=https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/D42D0685.pub" >> /etc/yum.repos.d/cuda.repo \
     && dnf module enable -y nvidia-driver:555-dkms \
-    && dnf install -y cuda-compiler-12-5 cuda-toolkit-12-5 nvidia-driver-cuda-libs \
+    && dnf install -y cuda-compiler-12-5 cuda-toolkit-12-5 nvidia-driver-cuda-libs cmake \
     && dnf clean all
 ENV CMAKE_ARGS="-DGGML_CUDA=on"
+ENV LLAMACPP_VER="b4501"
 ENV PATH=${PATH}:/usr/local/cuda-12.5/bin/
 # some of these are either not in F39 or have old version
-RUN pip3 install llama_cpp_python==0.2.85 starlette drain3 sse-starlette starlette-context \
-    pydantic-settings fastapi[standard] \
-    && mkdir /src
-COPY ./logdetective/ /src/logdetective/logdetective
+# RUN pip3 install llama_cpp_python==0.2.85 starlette drain3 sse-starlette starlette-context \
+#     pydantic-settings fastapi[standard] \
+#     && mkdir /src
+
+# Clone, checkout, build and move llama.cpp server to path
+RUN git clone https://github.com/ggerganov/llama.cpp.git && \
+    cd llama.cpp && \
+    git checkout $LLAMACPP_VER && \
+    cmake -B build && \
+    cmake --build build --config Release -j 4 -t llama-server && \
+    mv ./build/bin/llama-server /bin/llama-server
+
+# COPY ./logdetective/ /src/logdetective/logdetective
diff --git a/docker-compose-prod.yaml b/docker-compose-prod.yaml
index 18d70e3..cf0ba0c 100644
--- a/docker-compose-prod.yaml
+++ b/docker-compose-prod.yaml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --n_ctx 32768"
+    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
     stdin_open: true
     tty: true
     env_file: .env
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 90ef6bc..e9c06a7 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -6,7 +6,7 @@ services:
       context: .
       dockerfile: ./Containerfile.cuda
     hostname: "${LLAMA_CPP_HOST}"
-    command: "python3 -m llama_cpp.server --model ${MODEL_FILEPATH} --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT} --n_gpu_layers ${LLM_NGPUS:-0} --config /${LLAMA_CPP_CONFIG}"
+    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
     stdin_open: true
     tty: true
     env_file: .env
@@ -14,7 +14,6 @@ services:
       - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
     volumes:
       - ${MODELS_PATH-./models}:/models:Z
-      - ./${LLAMA_CPP_CONFIG}:/${LLAMA_CPP_CONFIG}:Z
     # these 4 lines are needed for CUDA acceleration
     # devices:
     #   - nvidia.com/gpu=all
diff --git a/llama_cpp_server_config.json b/llama_cpp_server_config.json
deleted file mode 100644
index 695de0d..0000000
--- a/llama_cpp_server_config.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "host": "0.0.0.0",
-    "port": 8000,
-    "models": [
-        {
-            "model": "models/mistral-7b-instruct-v0.2.Q4_K_S.gguf",
-            "model_alias": "default-model",
-            "chat_format": "mistral-instruct",
-            "n_gpu_layers": -1,
-            "offload_kqv": true,
-            "n_threads": 12,
-            "n_batch": 512,
-            "n_ctx": 32768
-        }
-    ]
-}
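
As a quick smoke test of the switched-over stack, the snippet below is a minimal sketch and not part of the patch: llama-server exposes an OpenAI-compatible /v1/chat/completions endpoint, and the model is addressed by the alias set through LLAMA_ARG_ALIAS. The localhost:8000 address assumes the LLAMA_CPP_SERVER_PORT=8000 default from .env, and the prompt text is only a placeholder.

import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",  # assumes LLAMA_CPP_SERVER_PORT=8000
    json={
        "model": "default-model",  # alias from LLAMA_ARG_ALIAS
        "messages": [
            # placeholder prompt for the smoke test
            {"role": "user", "content": "Explain this build log snippet: ..."},
        ],
    },
    timeout=120,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])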