Spaces: Runtime error
```python
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

app = FastAPI(title="MobileLLM-Pro API", description="Public API for MobileLLM-Pro")

# --- Load model & tokenizer ---
MODEL_PATH = "/app/model"

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    # float16 is only safe on GPU; many half-precision ops are unsupported on CPU,
    # so fall back to float32 when no CUDA device is available
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    device_map=None,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
# Set pad_token if missing
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token

SYSTEM_PROMPT = (
    "You are an expert AI assistant. Provide clear, accurate, and concise answers to the user's questions. "
    "Do not add extra commentary, disclaimers, or summaries unless asked. Answer directly."
)
# Route decorator presumably lost when the file was pasted; restored here.
@app.get("/")
def root():
    return {"message": "MobileLLM-Pro API is running!"}
# Presumed GET route taking query parameters; the decorator was missing in the paste.
@app.get("/generate")
def generate(prompt: str, max_tokens: int = 256):
    try:
        # Build messages with system instruction
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        # Use apply_chat_template with return_tensors="pt" (like in the working code)
        inputs = tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            add_generation_prompt=True,
            tokenize=True,  # explicit
        ).to(device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated part
        input_len = inputs.shape[1]
        generated_tokens = outputs[0][input_len:]
        result = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return {"input": prompt, "output": result.strip()}
    except Exception as e:
        return {"error": str(e)}
# Presumed POST route reading a JSON body; the decorator was missing in the paste.
@app.post("/generate")
async def generate(request: Request):
    try:
        # Read JSON body from request
        data = await request.json()
        prompt = data.get("prompt", "")
        max_tokens = data.get("max_tokens", 256)

        # Build messages with system instruction
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        # Apply chat template
        inputs = tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            add_generation_prompt=True,
            tokenize=True,
        ).to(device)

        # Generate
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated part
        input_len = inputs.shape[1]
        generated_tokens = outputs[0][input_len:]
        result = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return {"input": prompt, "output": result.strip()}
    except Exception as e:
        return {"error": str(e)}
```