import gradio as gr
import mlx.core as mx
import mlx.nn as nn
import spaces
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler
from transformers import AutoTokenizer  # NOTE(review): no longer used — mlx_lm.load() supplies the tokenizer

# Load the model and its tokenizer.
# mlx_lm.load() returns a (model, tokenizer) PAIR — the original assigned the
# whole tuple to `model` and built a separate transformers tokenizer, then fed
# PyTorch tensors into mlx_lm.generate(), which expects a plain prompt string.
model_name = "Sakalti/ultiima-78B-Q2-mlx"
model, tokenizer = load(model_name)


# ZeroGPU decorator is spaces.GPU (capitalized); spaces.gpu raises AttributeError.
@spaces.GPU(duration=100)
def chat(prompt, history, system_prompt, top_p, top_k, max_new_tokens):
    """Generate one assistant reply for a Gradio chat turn.

    gr.ChatInterface invokes fn as (message, history, *additional_inputs),
    so the parameter order here mirrors the additional_inputs list below:
    system_prompt, top_p, top_k, max_new_tokens. The original signature
    omitted `history` and put `system_prompt` last, shifting every argument.

    Args:
        prompt: The user's latest message.
        history: Prior turns supplied by ChatInterface (unused; the
            system prompt + current message form the context here).
        system_prompt: System instruction prepended to the conversation.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated reply text.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # Render the conversation with the model's own chat template;
    # generate() consumes the templated string directly — no manual
    # tokenization into framework tensors is needed (or valid).
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Current mlx_lm routes top_p/top_k through a sampler object;
    # generate() does not accept them as direct keyword arguments.
    # Sliders deliver floats, so coerce top_k to int.
    sampler = make_sampler(top_p=top_p, top_k=int(top_k))
    return generate(
        model,
        tokenizer,
        prompt=text,
        max_tokens=int(max_new_tokens),
        sampler=sampler,
    )


# Gradio UI. additional_inputs are forwarded to `chat` after
# (message, history) in exactly this order.
chat_interface = gr.ChatInterface(
    fn=chat,
    additional_inputs=[
        gr.Textbox(
            value="あなたはフレンドリーなチャットボットです。",
            label="System Prompt",
        ),
        gr.Slider(0.0, 1.0, value=0.9, label="Top-p"),
        gr.Slider(1, 100, value=50, label="Top-k"),
        gr.Slider(1, 1024, value=512, step=1, label="Max New Tokens"),
    ],
)

if __name__ == "__main__":
    chat_interface.launch()