# app.py - FINAL Self-Contained Backend
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- 1. Load the Model and Tokenizer ---
# We are loading a small, fast model directly into our app.
# This will run on a free CPU Space.
print("Loading model... This may take a moment.")

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

# Set the pad token to the end-of-sequence token if it is missing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully!")


# --- 2. The Inference Function ---
# This function runs the model on the CPU.
def get_response(message, history):
    # 1. Create a simple prompt.
    prompt = (
        "You are Healthify AI, a medical assistant. "
        "Provide a clear and concise answer to the user's question.\n\n"
        f"Question: {message}\n\nAnswer:"
    )

    try:
        # 2. Tokenize the prompt.
        inputs = tokenizer(prompt, return_tensors="pt")

        # 3. Generate the response.
        # Generation on a CPU is slow, so we limit the response length.
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=100,  # Keep this low for faster CPU responses.
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )

        # 4. Decode the full response (prompt + generated text).
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # 5. Extract *only* the answer part.
        # We find the "Answer:" marker and return everything after it.
        answer_marker = "Answer:"
        answer_start = full_response.find(answer_marker)
        if answer_start != -1:
            bot_response = full_response[answer_start + len(answer_marker):].strip()
        else:
            bot_response = "I'm not sure how to respond to that."  # Fallback

    except Exception as e:
        print(f"Error during generation: {e}")
        bot_response = "Sorry, I encountered an error while processing your request."

    return bot_response


# --- 3. The Gradio Interface ---
# This function handles the chat logic.
def handle_chat(message, history):
    # This is not a streaming call anymore.
    # It waits for the full response and then returns it.
    bot_response = get_response(message, history)
    history.append((message, bot_response))
    return history


# --- 4. Build the Gradio App ---
# This is what your frontend will connect to.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("# Healthify AI 🇳🇬 - Backend (Self-Contained)")
    gr.Markdown("This backend runs its own model on a CPU.")

    chatbot = gr.Chatbot(label="Healthify AI Chat")

    with gr.Row():
        prompt_input = gr.Textbox(
            label="Your Question",
            placeholder="Ask a medical question...",
            scale=4,
        )
        submit_button = gr.Button("Send", variant="primary", scale=1)

    submit_button.click(handle_chat, [prompt_input, chatbot], chatbot, show_progress="full")
    prompt_input.submit(handle_chat, [prompt_input, chatbot], chatbot, show_progress="full")

demo.queue().launch()
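
# --- Usage note (illustrative, not part of the app) ---
# A separate frontend can call this backend over Gradio's API, for example with
# the gradio_client package. The snippet below is a sketch for a *separate*
# client script, not for this file: the Space ID ("your-username/healthify-backend")
# is a placeholder, and the exact api_name and payload shape can vary by Gradio
# version (check the "Use via API" link on the running Space).
#
#   from gradio_client import Client
#
#   client = Client("your-username/healthify-backend")  # Space ID or local URL
#   updated_history = client.predict(
#       "What are the common symptoms of malaria?",  # message
#       [],                                          # current chat history
#       api_name="/handle_chat",
#   )
#   print(updated_history)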