update
Browse files
app.py
CHANGED
|
@@ -20,7 +20,7 @@ import json
|
|
| 20 |
import pytesseract
|
| 21 |
import openpyxl
|
| 22 |
import fitz # PyMuPDF
|
| 23 |
-
from
|
| 24 |
from PIL import Image
|
| 25 |
from mimetypes import guess_type
|
| 26 |
from openpyxl.styles import numbers
|
|
@@ -37,7 +37,7 @@ import pytesseract
|
|
| 37 |
import os
|
| 38 |
|
| 39 |
# 硬编码 Hugging Face 环境的 Tesseract 路径
|
| 40 |
-
pytesseract.pytesseract.tesseract_cmd = r'/
|
| 41 |
|
| 42 |
# 验证路径是否存在(调试用)
|
| 43 |
if not os.path.exists(pytesseract.pytesseract.tesseract_cmd):
|
|
@@ -75,7 +75,7 @@ class PDFImageToMarkdown:
|
|
| 75 |
prompt_txt = "Precisely identify the content within an image, specifically extracting all information and table content from an invoice. Convert the extracted information into Markdown format, and ensure the output is in Chinese."
|
| 76 |
|
| 77 |
response = self.client.chat.completions.create(
|
| 78 |
-
model="glm-
|
| 79 |
temperature=0,
|
| 80 |
messages=[
|
| 81 |
{
|
|
@@ -170,7 +170,6 @@ class PDFImageToMarkdown:
|
|
| 170 |
class MarkdownToJSON:
|
| 171 |
DEFAULT_PROMPT = """
|
| 172 |
请将以下 Markdown 格式的发票内容转换为 JSON 格式。要求如下:
|
| 173 |
-
|
| 174 |
1. **提取以下字段**:
|
| 175 |
- 发票代码(invoice_code)
|
| 176 |
- 开票日期(invoice_date)
|
|
@@ -190,7 +189,6 @@ class MarkdownToJSON:
|
|
| 190 |
- 价税合计(total_including_tax):包含大写金额(capitalized)和小写金额(numeric)
|
| 191 |
- 备注(remarks):以列表形式存储
|
| 192 |
- 开票人(issuer)
|
| 193 |
-
|
| 194 |
2. **JSON 格式要求**:
|
| 195 |
- 字段名称必须与上述要求一致。
|
| 196 |
- 金额和税额字段的值应为字符串类型。
|
|
@@ -213,7 +211,7 @@ class MarkdownToJSON:
|
|
| 213 |
|
| 214 |
try:
|
| 215 |
response = self.client.chat.completions.create(
|
| 216 |
-
model="glm-4
|
| 217 |
temperature=0,
|
| 218 |
top_p=0.1,
|
| 219 |
max_tokens=4095,
|
|
@@ -447,10 +445,10 @@ async def process_invoices(api_key, files, progress=gr.Progress()):
|
|
| 447 |
raise gr.Error("请输入智谱 API Key")
|
| 448 |
|
| 449 |
try:
|
| 450 |
-
client =
|
| 451 |
# 测试API连接
|
| 452 |
test_response = client.chat.completions.create(
|
| 453 |
-
model="glm-4",
|
| 454 |
messages=[{"role": "user", "content": "test"}],
|
| 455 |
temperature=0
|
| 456 |
)
|
|
@@ -609,4 +607,5 @@ with gr.Blocks(title="发票识别系统") as demo:
|
|
| 609 |
)
|
| 610 |
|
| 611 |
if __name__ == "__main__":
|
| 612 |
-
demo.launch()
|
|
|
|
|
|
| 20 |
import pytesseract
|
| 21 |
import openpyxl
|
| 22 |
import fitz # PyMuPDF
|
| 23 |
+
from zai import ZhipuAiClient
|
| 24 |
from PIL import Image
|
| 25 |
from mimetypes import guess_type
|
| 26 |
from openpyxl.styles import numbers
|
|
|
|
| 37 |
import os
|
| 38 |
|
| 39 |
# 硬编码 Hugging Face 环境的 Tesseract 路径
|
| 40 |
+
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'
|
| 41 |
|
| 42 |
# 验证路径是否存在(调试用)
|
| 43 |
if not os.path.exists(pytesseract.pytesseract.tesseract_cmd):
|
|
|
|
| 75 |
prompt_txt = "Precisely identify the content within an image, specifically extracting all information and table content from an invoice. Convert the extracted information into Markdown format, and ensure the output is in Chinese."
|
| 76 |
|
| 77 |
response = self.client.chat.completions.create(
|
| 78 |
+
model="glm-4.5v",
|
| 79 |
temperature=0,
|
| 80 |
messages=[
|
| 81 |
{
|
|
|
|
| 170 |
class MarkdownToJSON:
|
| 171 |
DEFAULT_PROMPT = """
|
| 172 |
请将以下 Markdown 格式的发票内容转换为 JSON 格式。要求如下:
|
|
|
|
| 173 |
1. **提取以下字段**:
|
| 174 |
- 发票代码(invoice_code)
|
| 175 |
- 开票日期(invoice_date)
|
|
|
|
| 189 |
- 价税合计(total_including_tax):包含大写金额(capitalized)和小写金额(numeric)
|
| 190 |
- 备注(remarks):以列表形式存储
|
| 191 |
- 开票人(issuer)
|
|
|
|
| 192 |
2. **JSON 格式要求**:
|
| 193 |
- 字段名称必须与上述要求一致。
|
| 194 |
- 金额和税额字段的值应为字符串类型。
|
|
|
|
| 211 |
|
| 212 |
try:
|
| 213 |
response = self.client.chat.completions.create(
|
| 214 |
+
model="glm-4.6",
|
| 215 |
temperature=0,
|
| 216 |
top_p=0.1,
|
| 217 |
max_tokens=4095,
|
|
|
|
| 445 |
raise gr.Error("请输入智谱 API Key")
|
| 446 |
|
| 447 |
try:
|
| 448 |
+
client = ZhipuAiClient(api_key=api_key)
|
| 449 |
# 测试API连接
|
| 450 |
test_response = client.chat.completions.create(
|
| 451 |
+
model="glm-4.6",
|
| 452 |
messages=[{"role": "user", "content": "test"}],
|
| 453 |
temperature=0
|
| 454 |
)
|
|
|
|
| 607 |
)
|
| 608 |
|
| 609 |
if __name__ == "__main__":
|
| 610 |
+
demo.launch()
|
| 611 |
+
|