您的用户名 committed
Commit b53c3a8 · 1 Parent(s): 9ae7eaa

Final fix: Switch to lighter 4B model to fit in memory

Files changed (1): main.py (+43 -12)
main.py CHANGED
@@ -6,7 +6,7 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 
 # ================================================================
-# Dynamically install the core AI engine
+# Dynamic installation of the core AI engine
 # ================================================================
 # Check whether the core library exists; if not, install it dynamically on the first run
 try:
@@ -17,7 +17,16 @@ except ImportError:
     logging.warning("This step is extremely slow (expect 15-25 minutes) and runs only once. Please wait patiently for the logs to finish.")
     try:
         # Run the pip install command via subprocess
-        subprocess.check_call([sys.executable, "-m", "pip", "install", "llama-cpp-python"])
+        # Point the install target at a local directory where we have write permission
+        install_path = "/app/pip_packages"
+        os.makedirs(install_path, exist_ok=True)
+        # Add this path to Python's module search path
+        sys.path.append(install_path)
+        subprocess.check_call([
+            sys.executable, "-m", "pip", "install",
+            f"--target={install_path}",
+            "llama-cpp-python"
+        ])
         logging.info("Core AI engine installed successfully! Re-importing...")
         # Re-import
         from llama_cpp import Llama
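A note on the hunk above: the --target install plus the sys.path extension is what lets the install survive a read-only site-packages directory, and the path has to be re-added on every process start, since pip only writes the files. The same pattern can be factored into a reusable helper. The sketch below is illustrative only; the ensure_package name, the /tmp/pip_packages default, and the separate pip-name/module-name arguments are assumptions, not part of this commit.

    import importlib
    import os
    import subprocess
    import sys

    def ensure_package(pip_name, module_name, target="/tmp/pip_packages"):
        """Import module_name, installing pip_name into target on first failure."""
        os.makedirs(target, exist_ok=True)
        if target not in sys.path:
            # Prepend so the freshly installed copy wins over any stale copy;
            # the commit itself uses sys.path.append(), which also works.
            sys.path.insert(0, target)
        try:
            return importlib.import_module(module_name)
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install",
                                   f"--target={target}", pip_name])
            importlib.invalidate_caches()  # make the new files visible to the importer
            return importlib.import_module(module_name)

    # Hypothetical usage: Llama = ensure_package("llama-cpp-python", "llama_cpp").Llama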
@@ -29,11 +38,16 @@ except ImportError:
 from huggingface_hub import hf_hub_download
 
 # ================================================================
-# The code below is the same as the previous version
+# Final main.py code
 # ================================================================
 app = FastAPI()
-MODEL_ID = "Qwen/Qwen1.5-7B-Chat-GGUF"
-MODEL_FILE = "qwen1_5-7b-chat-q5_k_m.gguf"
+
+# --- The key fix is here ---
+# Switch from the 7B model to the lighter 4B model to fit within the 16GB memory limit
+MODEL_ID = "Qwen/Qwen1.5-4B-Chat-GGUF"
+# Also select the matching quantized file for this model
+MODEL_FILE = "qwen1_5-4b-chat-q5_k_m.gguf"
+
 llm = None
 
 @app.on_event("startup")
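For a sense of scale behind the 7B-to-4B swap, a back-of-envelope estimate of weight memory, assuming Q5_K_M averages about 5.5 bits per weight (the exact quant mix and GGUF metadata vary by model):

    BITS_PER_WEIGHT = 5.5  # rough average for Q5_K_M (assumption)
    for params in (7e9, 4e9):
        gb = params * BITS_PER_WEIGHT / 8 / 1e9
        print(f"{params / 1e9:.0f}B parameters @ Q5_K_M ≈ {gb:.1f} GB")
    # Roughly 4.8 GB vs 2.8 GB for the weights alone -- before the KV cache,
    # llama.cpp compute buffers, and the FastAPI process share the same 16 GB.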
@@ -44,17 +58,24 @@ def load_model():
         return
 
     logging.info("Loading the GGUF model on the CPU with llama-cpp-python...")
-    logging.info("This will still be slow; please wait patiently while the model downloads and loads...")
+    logging.info(f"Target model: {MODEL_ID}/{MODEL_FILE}")
     try:
-        model_path = hf_hub_download(repo_id=MODEL_ID, filename=MODEL_FILE)
+        # 1. Download the model file from the Hugging Face Hub into the local cache
+        model_path = hf_hub_download(
+            repo_id=MODEL_ID,
+            filename=MODEL_FILE
+        )
         logging.info(f"Model downloaded successfully to: {model_path}")
+
+        # 2. Load the model with llama-cpp-python
         llm = Llama(
             model_path=model_path,
-            n_ctx=4096,
-            n_threads=2,
-            n_gpu_layers=0
+            n_ctx=4096,      # context window length
+            n_threads=2,     # CPU threads; the best setting for the free tier's 2-core CPU
+            n_gpu_layers=0   # explicitly run on the CPU only
         )
         logging.info("AI model loaded successfully! The API is ready.")
+
     except Exception as e:
         logging.error("!!!!!!!!!!!!!! Model loading failed !!!!!!!!!!!!!!")
         logging.error(f"Error type: {type(e).__name__}")
@@ -69,13 +90,23 @@ class ChatCompletionRequest(BaseModel):
 def chat_completions(request: ChatCompletionRequest):
     if llm is None:
         return {"error": "The model failed to load, so the API is unavailable. Please check the Space logs."}
+
+    # llama-cpp-python accepts OpenAI-format messages directly
     messages = request.messages
+
     try:
         logging.info("Generating a reply...")
-        completion = llm.create_chat_completion(messages=messages, max_tokens=2048, temperature=0.7)
-        response_text = completion['choices'][0]['message']['content']
+        # Call create_chat_completion directly
+        completion = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=2048,
+            temperature=0.7
+        )
         logging.info("Reply generated successfully!")
+
+        # Return the OpenAI-compatible response unchanged
         return completion
+
     except Exception as e:
         logging.error(f"Error while generating the reply: {e}")
         return {"error": "Internal error while generating the reply."}
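Because the handler returns llama-cpp-python's completion dict as-is, the endpoint can be consumed like a standard OpenAI-style chat-completions API. A sketch of a client call; the /v1/chat/completions path and port 7860 (the Hugging Face Spaces default) are assumptions, since the route decorator and the ChatCompletionRequest definition fall outside this hunk:

    import requests

    resp = requests.post(
        "http://localhost:7860/v1/chat/completions",  # path and port assumed
        json={"messages": [{"role": "user", "content": "Hello!"}]},
    )
    completion = resp.json()
    # create_chat_completion() returns an OpenAI-style dict, so the usual
    # response shape applies.
    print(completion["choices"][0]["message"]["content"])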
 