# --- 1. Use a standard, reliable Python base ---
FROM python:3.12-slim

WORKDIR /app

# --- 2. Install C/C++ build tools & venv ---
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    python3-venv \
    && rm -rf /var/lib/apt/lists/*

# --- 3. Download the model first ---
# This uses a safe, isolated venv just for downloading
COPY download_model.py .
RUN --mount=type=secret,id=HF_TOKEN \
    sh -c 'python3 -m venv /tmp/downloader-venv && \
    . /tmp/downloader-venv/bin/activate && \
    pip install huggingface_hub && \
    python3 download_model.py'

# --- 4. Build llama-cpp-python (CPU-ONLY) ---
# This is the CRITICAL FIX.
# This forces a fast, CPU-only build that will not time out.
ENV CMAKE_ARGS="-DLLAMA_CUDA=OFF"

# This build step will now be fast (1-2 minutes)
RUN pip install "llama-cpp-python[server]"

# --- 5. Set the runtime command ---
# Expose the port (matches README.md)
EXPOSE 8000

# This command runs the server
CMD [ \
    "python", \
    "-m", "llama_cpp.server", \
    "--model", "prem-1B-SQL.Q8_0.gguf", \
    "--n_gpu_layers", "0", \
    "--port", "8000", \
    "--host", "0.0.0.0", \
    "--api_key_env_var", "API_KEY" \
]
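
# ----------------------------------------------------------------------
# NOTE: download_model.py is referenced above but not included in this
# file. A minimal sketch of what it is assumed to do follows: read the
# HF_TOKEN BuildKit secret (mounted at /run/secrets/HF_TOKEN by default)
# and fetch the GGUF file into /app. The repo_id "premai-io/prem-1B-SQL"
# is an assumption inferred from the model filename, not confirmed here:
#
#     import os
#     from huggingface_hub import hf_hub_download
#
#     # BuildKit mounts secrets at /run/secrets/<id>; token may be
#     # absent when the script is run outside a secret-enabled build.
#     token = None
#     if os.path.exists("/run/secrets/HF_TOKEN"):
#         with open("/run/secrets/HF_TOKEN") as f:
#             token = f.read().strip()
#
#     hf_hub_download(
#         repo_id="premai-io/prem-1B-SQL",   # assumed repo id
#         filename="prem-1B-SQL.Q8_0.gguf",  # matches the CMD above
#         local_dir=".",                     # /app, the WORKDIR
#         token=token,
#     )
#
# Example build/run (the secret file name and image tag are arbitrary):
#
#     docker build --secret id=HF_TOKEN,src=./hf_token.txt -t prem-sql .
#     docker run --rm -p 8000:8000 -e API_KEY=changeme prem-sql
# ----------------------------------------------------------------------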