# app.py
"""LLM runner service: RabbitMQ listener + Gradio UI mounted on a FastAPI app.

ZeroGPU-ready: exposes a trivial @spaces.GPU entrypoint so the Hugging Face
Spaces detector sees a GPU endpoint even before real CUDA work exists.
"""

import asyncio  # kept from original module; presumably used by siblings — TODO confirm
from contextlib import asynccontextmanager, suppress

import gradio as gr
from fastapi import FastAPI

from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from service import LLMService
from runners.base import ILLMRunner

# =========================
# @spaces.GPU() SECTION
# =========================
# Mirrors the working Space: define a concrete GPU-decorated fn that Gradio calls.
try:
    import spaces

    ZERO_GPU_AVAILABLE = True

    @spaces.GPU(duration=120)  # trivial GPU entrypoint; detector-friendly
    def gpu_entrypoint():
        """
        Minimal GPU function so ZeroGPU sees a GPU endpoint.
        Replace the body later with real CUDA work as needed.
        """
        return "gpu: ready"

except Exception:
    # Deliberate best-effort: `spaces` only exists on HF Spaces hardware.
    ZERO_GPU_AVAILABLE = False

    def gpu_entrypoint():
        """CPU-only fallback used when the `spaces` package is unavailable."""
        return "gpu: not available (CPU only)"


# ---------------- Runner factory (stub) ----------------
class EchoRunner(ILLMRunner):
    """No-op runner stub; every lifecycle hook accepts and ignores its input."""

    Type = "EchoRunner"

    async def StartProcess(self, llmServiceObj: dict):
        pass

    async def RemoveProcess(self, sessionId: str):
        pass

    async def StopRequest(self, sessionId: str):
        pass

    async def SendInputAndGetResponse(self, llmServiceObj: dict):
        pass


async def runner_factory(llmServiceObj: dict) -> ILLMRunner:
    """Return a fresh stub runner; swap in a real runner implementation later."""
    return EchoRunner()


# ---------------- Publisher and Service ----------------
publisher = RabbitRepo(external_source="https://space.external")
service = LLMService(publisher, runner_factory)


# ---------------- Handlers (.NET FuncName -> service) ----------------
async def h_start(data):
    await service.StartProcess(data or {})

async def h_user(data):
    await service.UserInput(data or {})

async def h_remove(data):
    await service.RemoveSession(data or {})

async def h_stop(data):
    await service.StopRequest(data or {})

async def h_qir(data):
    await service.QueryIndexResult(data or {})

async def h_getreg(_):
    await service.GetFunctionRegistry(False)

async def h_getreg_f(_):
    await service.GetFunctionRegistry(True)


handlers = {
    "llmStartSession": h_start,
    "llmUserInput": h_user,
    "llmRemoveSession": h_remove,
    "llmStopRequest": h_stop,
    "queryIndexResult": h_qir,
    "getFunctionRegistry": h_getreg,
    "getFunctionRegistryFiltered": h_getreg_f,
}

# ---------------- Listener wiring ----------------
base = RabbitBase()
listener = RabbitListenerBase(
    base,
    instance_name=settings.RABBIT_INSTANCE_NAME,  # queue prefix like your .NET instance
    handlers=handlers,
)


def _decl(func_name: str, timeout_ms: int) -> dict:
    """Build one exchange declaration, mirroring the C# InitRabbitMQObjs() shape."""
    return {
        "ExchangeName": f"{func_name}{settings.SERVICE_ID}",
        "FuncName": func_name,
        "MessageTimeout": timeout_ms,
        "RoutingKeys": [settings.RABBIT_ROUTING_KEY],
    }


# Declarations mirror your C# InitRabbitMQObjs()
DECLS = [
    _decl("llmStartSession", 600_000),
    _decl("llmUserInput", 600_000),
    _decl("llmRemoveSession", 60_000),
    _decl("llmStopRequest", 60_000),
    _decl("queryIndexResult", 60_000),
    _decl("getFunctionRegistry", 60_000),
    _decl("getFunctionRegistryFiltered", 60_000),
]


# ---------------- Gradio UI (smoke test + GPU button) ----------------
async def ping():
    """Liveness probe wired to the UI 'Ping' button."""
    return "ok"


with gr.Blocks() as demo:
    gr.Markdown("### LLM Runner (Python) — RabbitMQ listener (ZeroGPU-ready)")
    with gr.Row():
        btn = gr.Button("Ping")
        out = gr.Textbox(label="Ping result")
    btn.click(ping, inputs=None, outputs=out)
    # Reference the GPU-decorated function **directly** (no lambda)
    with gr.Row():
        gpu_btn = gr.Button("GPU Ready Probe")
        gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
    gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)


# ---------------- FastAPI + lifespan ----------------
@asynccontextmanager
async def lifespan(_app: FastAPI):
    # startup
    await publisher.connect()
    await service.init()
    await listener.start(DECLS)
    try:
        yield
    finally:
        # Best-effort shutdown: close what we opened, but never let teardown
        # raise (the methods may not exist on every implementation).
        for closer in (getattr(listener, "stop", None), getattr(publisher, "close", None)):
            if closer is not None:
                with suppress(Exception):
                    await closer()


app = FastAPI(lifespan=lifespan)


@app.get("/health")
async def health():
    return {"status": "ok"}


# Also expose the probe via HTTP (extra-safe for detectors)
@app.get("/gpu-probe")
def gpu_probe_route():
    return {"status": gpu_entrypoint()}


# Mount Gradio LAST: a mount at "/" matches every path, and Starlette matches
# routes in registration order — any @app.get added after this mount would be
# shadowed and unreachable.
app = gr.mount_gradio_app(app, demo, path="/")


if __name__ == "__main__":
    # For local runs; on HF Spaces, the SDK manages the server.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)