Commit
·
2ed485e
1
Parent(s):
11dee55
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# app.py
|
| 2 |
-
import asyncio, logging
|
| 3 |
import gradio as gr
|
| 4 |
|
| 5 |
from config import settings
|
|
@@ -31,17 +31,18 @@ except Exception:
|
|
| 31 |
|
| 32 |
# ----------------- vLLM init -----------------
|
| 33 |
async def init_vllm():
|
| 34 |
-
"""Initialize the global vLLM engine and store it in state.py"""
|
| 35 |
if state.vllm_engine is not None:
|
| 36 |
return state.vllm_engine
|
| 37 |
|
| 38 |
model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
|
| 39 |
log.info(f"Loading vLLM model: {model_id}")
|
| 40 |
|
|
|
|
| 41 |
args = AsyncEngineArgs(
|
| 42 |
model=model_id,
|
| 43 |
trust_remote_code=True,
|
| 44 |
max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
|
|
|
|
| 45 |
)
|
| 46 |
state.vllm_engine = AsyncLLMEngine.from_engine_args(args)
|
| 47 |
return state.vllm_engine
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
import asyncio, logging, os
|
| 3 |
import gradio as gr
|
| 4 |
|
| 5 |
from config import settings
|
|
|
|
| 31 |
|
| 32 |
# ----------------- vLLM init -----------------
|
| 33 |
async def init_vllm():
|
|
|
|
| 34 |
if state.vllm_engine is not None:
|
| 35 |
return state.vllm_engine
|
| 36 |
|
| 37 |
model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
|
| 38 |
log.info(f"Loading vLLM model: {model_id}")
|
| 39 |
|
| 40 |
+
# Always use GPU (cuda) — Spaces provides GPU when @spaces.GPU is active
|
| 41 |
args = AsyncEngineArgs(
|
| 42 |
model=model_id,
|
| 43 |
trust_remote_code=True,
|
| 44 |
max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
|
| 45 |
+
device="cuda", # ✅ force GPU
|
| 46 |
)
|
| 47 |
state.vllm_engine = AsyncLLMEngine.from_engine_args(args)
|
| 48 |
return state.vllm_engine
|