Rohit-Katkar2003 committed
Commit 18a4e9d · verified · 1 Parent(s): 2452d2e

update app.py

Files changed (1)
  1. app.py +21 -15
app.py CHANGED
@@ -15,50 +15,56 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 )
 
-# Ensure tokenizer has a chat template
-if tokenizer.chat_template is None:
-    # Fallback: define a basic one if needed (check model card for correct format)
-    tokenizer.chat_template = "{% for message in messages %}{{message['role']}}: {{message['content']}}\n{% endfor %}"
-
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 model.eval()
 print(f"✅ Model loaded on {device}!")
 
+# Define a strong system prompt
+SYSTEM_PROMPT = (
+    "You are an expert AI assistant. Provide clear, accurate, and concise answers to the user's questions. "
+    "Do not add extra commentary, disclaimers, or summaries unless asked. Answer directly."
+)
+
 @app.get("/")
 def root():
     return {"message": "MobileLLM-Pro API is running!"}
 
 @app.get("/generate")
-def generate(prompt: str, max_tokens: int = 50):
+def generate(prompt: str, max_tokens: int = 256):
     try:
-        # Format as a chat with user message
-        messages = [{"role": "user", "content": prompt}]
+        # Construct full chat with system + user
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": prompt}
+        ]
 
         # Apply chat template
         input_text = tokenizer.apply_chat_template(
             messages,
-            tokenize=False,  # We'll tokenize next
-            add_generation_prompt=True  # Adds assistant start token
+            tokenize=False,
+            add_generation_prompt=True  # Ensures <|assistant|> or equivalent is added
         )
 
-        # Tokenize
+        # Tokenize and move to device
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
 
-        # Generate
+        # Generate response
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_tokens,
             do_sample=True,
             temperature=0.7,
+            top_p=0.9,
             pad_token_id=tokenizer.eos_token_id
         )
 
-        # Decode only the generated part (after input)
-        generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
+        # Extract only the generated part (after input)
+        input_len = inputs.input_ids.shape[1]
+        generated_tokens = outputs[0][input_len:]
         result = tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
-        return {"input": prompt, "output": result}
+        return {"input": prompt, "output": result.strip()}
 
     except Exception as e:
         return {"error": str(e)}