# app.py - FINAL Self-Contained Backend
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- 1. Load the Model and Tokenizer ---
# We are loading a small, fast model directly into our app.
# This will run on a free CPU Space.
print("Loading model... This may take a moment.")

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

# Set the pad token to the end-of-sequence token if it is missing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully!")


# --- 2. The Inference Function ---
# This function runs the model on the CPU.
def get_response(message, history):
    # 1. Create a simple prompt.
    prompt = (
        "You are Healthify AI, a medical assistant. "
        "Provide a clear and concise answer to the user's question.\n\n"
        f"Question: {message}\n\nAnswer:"
    )

    try:
        # 2. Tokenize the prompt.
        inputs = tokenizer(prompt, return_tensors="pt")

        # 3. Generate the response.
        # Generation on a CPU is slow, so we limit the response length.
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=100,  # Keep this low for faster CPU responses.
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )

        # 4. Decode the full response (prompt + generated text).
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # 5. Extract *only* the answer part.
        # We find the "Answer:" marker and return everything after it.
        answer_marker = "Answer:"
        answer_start = full_response.find(answer_marker)
        if answer_start != -1:
            bot_response = full_response[answer_start + len(answer_marker):].strip()
        else:
            bot_response = "I'm not sure how to respond to that."  # Fallback

    except Exception as e:
        print(f"Error during generation: {e}")
        bot_response = "Sorry, I encountered an error while processing your request."

    return bot_response


# --- 3. The Gradio Interface ---
# This function handles the chat logic.
def handle_chat(message, history):
    # This is not a streaming call anymore.
    # It waits for the full response and then returns it.
    bot_response = get_response(message, history)
    history.append((message, bot_response))
    return history


# --- 4. Build the Gradio App ---
# This is what your frontend will connect to.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("# Healthify AI 🇳🇬 - Backend (Self-Contained)")
    gr.Markdown("This backend runs its own model on a CPU.")

    chatbot = gr.Chatbot(label="Healthify AI Chat")

    with gr.Row():
        prompt_input = gr.Textbox(
            label="Your Question",
            placeholder="Ask a medical question...",
            scale=4,
        )
        submit_button = gr.Button("Send", variant="primary", scale=1)

    submit_button.click(handle_chat, [prompt_input, chatbot], chatbot, show_progress="full")
    prompt_input.submit(handle_chat, [prompt_input, chatbot], chatbot, show_progress="full")

demo.queue().launch()
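
# --- Usage note (illustrative, not part of the app) ---
# A separate frontend can call this backend over Gradio's API, for example with
# the gradio_client package. The snippet below is a sketch for a *separate*
# client script, not for this file: the Space ID ("your-username/healthify-backend")
# is a placeholder, and the exact api_name and payload shape can vary by Gradio
# version (check the "Use via API" link on the running Space).
#
#   from gradio_client import Client
#
#   client = Client("your-username/healthify-backend")  # Space ID or local URL
#   updated_history = client.predict(
#       "What are the common symptoms of malaria?",  # message
#       [],                                          # current chat history
#       api_name="/handle_chat",
#   )
#   print(updated_history)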