PreethiCarmelBosco committed on
Commit edd4ff1 · verified · 1 Parent(s): 28eb58c
Files changed (1)
  1. Dockerfile +32 -20
Dockerfile CHANGED
@@ -1,31 +1,43 @@
-# --- 1. Use the official Hugging Face TGI image ---
-# This is a pre-built image with everything included.
-FROM ghcr.io/huggingface/text-generation-inference:latest
+# --- 1. Use a standard, reliable Python base ---
+FROM python:3.12-slim
 
-# --- 2. Download the GGUF model using cURL ---
-# We use cURL (which is already in the image) to avoid
-# installing Python and causing version conflicts.
 WORKDIR /app
 
-# Get the HF_TOKEN from the build secrets
-ARG HF_TOKEN
+# --- 2. Install C/C++ build tools & venv ---
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
 
-# Run the download command
+# --- 3. Download the model first ---
+# This uses a safe, isolated venv just for downloading
+COPY download_model.py .
 RUN --mount=type=secret,id=HF_TOKEN \
-    curl -L \
-    -H "Authorization: Bearer $(cat /run/secrets/HF_TOKEN)" \
-    "https://huggingface.co/mradermacher/prem-1B-SQL-GGUF/resolve/main/prem-1B-SQL.Q8_0.gguf" \
-    -o "prem-1B-SQL.Q8_0.gguf"
 
-# --- 3. Set the container's command to run TGI ---
-# This is the command that will run when the container starts.
-ENV MODEL_ID="/app/prem-1B-SQL.Q8_0.gguf"
+    sh -c 'python3 -m venv /tmp/downloader-venv && \
+    . /tmp/downloader-venv/bin/activate && \
+    pip install huggingface_hub && \
+    python3 download_model.py'
+
+# --- 4. Build llama-cpp-python (CPU-ONLY) ---
+# This is the CRITICAL FIX.
+# This forces a fast, CPU-only build that will not time out.
+ENV CMAKE_ARGS="-DLLAMA_CUDA=OFF"
 
+# This build step will now be fast (1-2 minutes)
+RUN pip install "llama-cpp-python[server]"
+
+# --- 5. Set the runtime command ---
+# Expose the port (matches README.md)
+EXPOSE 8000
+
+# This command runs the server
 CMD [ \
-    "text-generation-launcher", \
-    "--model-id", "${MODEL_ID}", \
-    "--quantize", "gguf", \
+    "python", \
+    "-m", "llama_cpp.server", \
+    "--model", "prem-1B-SQL.Q8_0.gguf", \
+    "--n_gpu_layers", "0", \
     "--port", "8000", \
     "--host", "0.0.0.0", \
-    "--openai-api-key-env-var", "API_KEY" \
+    "--api_key_env_var", "API_KEY" \
 ]
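
The new build copies and runs download_model.py, but that script is not part of this diff. A minimal sketch of what it might look like, assuming it uses huggingface_hub's hf_hub_download to fetch the same GGUF file the old Dockerfile pulled via cURL, reading the token from the BuildKit secret mount used in the RUN step:

# Hypothetical reconstruction of download_model.py (not included in this commit).
# Assumes the BuildKit secret is mounted at /run/secrets/HF_TOKEN, as in the RUN step.
import os

from huggingface_hub import hf_hub_download

# Read the token from the secret mount if present; public repos work without one.
token = None
secret_path = "/run/secrets/HF_TOKEN"
if os.path.exists(secret_path):
    with open(secret_path) as f:
        token = f.read().strip()

# Download the quantized model into the working directory (/app),
# where the CMD expects to find prem-1B-SQL.Q8_0.gguf.
hf_hub_download(
    repo_id="mradermacher/prem-1B-SQL-GGUF",
    filename="prem-1B-SQL.Q8_0.gguf",
    local_dir=".",
    token=token,
)

Running the download in its own layer before the llama-cpp-python build also means the slow model fetch is cached by Docker independently of the compile step.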
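Once the container is up, llama_cpp.server exposes an OpenAI-compatible HTTP API on port 8000. A hedged smoke test using only the Python standard library; the endpoint path is standard for llama_cpp.server, while the Bearer-token header assumes the server actually enforces the key named in the CMD's API_KEY option:

# Hypothetical smoke test against the running container, assuming the
# OpenAI-compatible /v1/completions endpoint on localhost:8000.
import json
import os
import urllib.request

body = {
    "prompt": "-- List the names of all users\nSELECT",
    "max_tokens": 32,
    "temperature": 0.0,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(body).encode("utf-8"),
    headers={
        "Content-Type": "application/json",
        # If the server enforces an API key, send it as a Bearer token.
        "Authorization": f"Bearer {os.environ.get('API_KEY', '')}",
    },
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

print(result["choices"][0]["text"])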