# app.py
import asyncio
import logging

import gradio as gr

from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from oa_server import OpenAIServers
from vllm_backend import VLLMChatBackend, StubImagesBackend
import state  # holds vllm_engine reference

# ---- vLLM imports ----
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
log = logging.getLogger("app")

# ----------------- Hugging Face Spaces helpers -----------------
try:
    import spaces

    @spaces.GPU(duration=60)
    def gpu_entrypoint() -> str:
        return "gpu: ready"

    @spaces.GPU(duration=600)
    def _build_vllm_engine_on_gpu(model_id: str, max_len: int):
        args = AsyncEngineArgs(
            model=model_id,
            trust_remote_code=True,
            max_model_len=max_len,
        )
        return AsyncLLMEngine.from_engine_args(args)

except Exception:
    # Fallbacks when the `spaces` package is unavailable (e.g. local runs):
    # same engine construction, just without the @spaces.GPU decorators.
    def gpu_entrypoint() -> str:
        return "gpu: not available (CPU only)"

    def _build_vllm_engine_on_gpu(model_id: str, max_len: int):
        args = AsyncEngineArgs(
            model=model_id,
            trust_remote_code=True,
            max_model_len=max_len,
        )
        return AsyncLLMEngine.from_engine_args(args)

# ----------------- vLLM init -----------------
async def init_vllm():
    if state.vllm_engine is not None:
        return state.vllm_engine
    model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
    max_len = int(getattr(settings, "LlmOpenAICtxSize", 32768))
    log.info(f"Loading vLLM model: {model_id}")
    # Build inside a GPU context so Spaces ZeroGPU exposes CUDA
    state.vllm_engine = _build_vllm_engine_on_gpu(model_id, max_len)
    return state.vllm_engine

# ----------------- RabbitMQ wiring -----------------
publisher = RabbitRepo(external_source="openai.mq.server")
# "oa.*" exchanges are declared as direct; everything else uses the configured type.
resolver = lambda name: "direct" if name.startswith("oa.") else settings.RABBIT_EXCHANGE_TYPE
base = RabbitBase(exchange_type_resolver=resolver)

servers = OpenAIServers(
    publisher,
    chat_backend=VLLMChatBackend(),
    images_backend=StubImagesBackend(),
)

handlers = {
    "oaChatCreate": servers.handle_chat_create,
    "oaImagesGenerate": servers.handle_images_generate,
}

DECLS = [
    {"ExchangeName": "oa.chat.create", "FuncName": "oaChatCreate",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": "oa.images.generate", "FuncName": "oaImagesGenerate",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
]

listener = RabbitListenerBase(base, instance_name=settings.RABBIT_INSTANCE_NAME, handlers=handlers)

# ----------------- Startup init -----------------
async def _startup_init():
    try:
        await init_vllm()            # load vLLM model
        await base.connect()         # connect to RabbitMQ
        await listener.start(DECLS)  # start queue listeners
        return "OpenAI MQ + vLLM: ready"
    except Exception as e:
        log.exception("Startup init failed")
        return f"ERROR: {e}"

async def ping():
    return "ok"

# ----------------- Gradio UI -----------------
with gr.Blocks(title="OpenAI over RabbitMQ (local vLLM)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## OpenAI-compatible over RabbitMQ — using vLLM locally inside Space")

    with gr.Tabs():
        with gr.Tab("Service"):
            btn = gr.Button("Ping")
            out = gr.Textbox(label="Ping result")
            btn.click(ping, inputs=None, outputs=out)

            init_status = gr.Textbox(label="Startup status", interactive=False)
            demo.load(fn=_startup_init, inputs=None, outputs=init_status)

        with gr.Tab("@spaces.GPU Probe"):
            gpu_btn = gr.Button("GPU Ready Probe", variant="primary")
            gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
            gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, debug=True)
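# ----------------- Client-side sketch (illustrative only) -----------------
# A minimal, hypothetical example of how an external producer might publish a
# chat request to the "oa.chat.create" exchange using pika. The payload shape,
# routing key, and broker URL below are assumptions for illustration; the real
# message contract is defined by RabbitRepo / RabbitListenerBase / OpenAIServers,
# which are not shown in this file.
#
#   import json
#   import pika
#
#   connection = pika.BlockingConnection(
#       pika.URLParameters("amqp://guest:guest@localhost:5672/")  # assumed broker URL
#   )
#   channel = connection.channel()
#   channel.basic_publish(
#       exchange="oa.chat.create",
#       routing_key="oa",  # must match settings.RABBIT_ROUTING_KEY in practice
#       body=json.dumps({
#           "model": "Qwen/Qwen2.5-7B-Instruct",
#           "messages": [{"role": "user", "content": "Hello!"}],
#       }),
#   )
#   connection.close()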