johnbridges committed on
Commit
2ed485e
·
1 Parent(s): 11dee55
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # app.py
2
- import asyncio, logging
3
  import gradio as gr
4
 
5
  from config import settings
@@ -31,17 +31,18 @@ except Exception:
31
 
32
  # ----------------- vLLM init -----------------
33
async def init_vllm():
    """Create the shared vLLM engine on first call and cache it in state.py.

    Subsequent calls return the already-built engine unchanged.
    """
    engine = state.vllm_engine
    if engine is None:
        # Model id and context size both come from settings, with fallbacks.
        model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
        log.info(f"Loading vLLM model: {model_id}")

        engine_args = AsyncEngineArgs(
            model=model_id,
            trust_remote_code=True,
            max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
        )
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        state.vllm_engine = engine
    return engine
 
1
  # app.py
2
+ import asyncio, logging, os
3
  import gradio as gr
4
 
5
  from config import settings
 
31
 
32
  # ----------------- vLLM init -----------------
33
async def init_vllm():
    """Initialize the global vLLM engine and store it in state.py.

    Returns the cached engine if one was already created; otherwise builds an
    AsyncLLMEngine from settings (model id and max context length, with
    defaults) and caches it in ``state.vllm_engine``.
    """
    if state.vllm_engine is not None:
        return state.vllm_engine

    model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
    log.info(f"Loading vLLM model: {model_id}")

    # Always use GPU (cuda) — Spaces provides GPU when @spaces.GPU is active
    args = AsyncEngineArgs(
        model=model_id,
        trust_remote_code=True,
        max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
        # NOTE(review): the `device` kwarg was deprecated/removed from
        # EngineArgs in newer vLLM releases (device selection is automatic);
        # confirm the pinned vLLM version still accepts it.
        device="cuda",  # ✅ force GPU
    )
    state.vllm_engine = AsyncLLMEngine.from_engine_args(args)
    return state.vllm_engine