# docling-app / app.py
# Revision f8aea0d ("Update app.py", verified) by AyoubChLin, 2.36 kB.
import asyncio
import os
import shutil
import tempfile
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from uuid import uuid4

from docling.document_converter import DocumentConverter
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

# ASGI application object; the routes in this module are registered on it.
app = FastAPI()

# CORS is left wide open: every origin, HTTP method and request header is
# accepted, and credentialed requests are allowed.
# NOTE(review): FastAPI's CORS documentation advises against combining
# wildcard origins with allow_credentials=True -- confirm that credentialed
# cross-origin calls are actually required before tightening this.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
# Shared worker pool with one thread per reported CPU.
# os.cpu_count() may return None, in which case two workers are used.
_reported_cpus = os.cpu_count()
MAX_WORKERS = _reported_cpus if _reported_cpus else 2
thread_pool = ThreadPoolExecutor(MAX_WORKERS)
# Process-wide holder for a single DocumentConverter
class PdfConverterSingleton:
    """Hand out one shared instance that owns a ``DocumentConverter``.

    Calling the class always returns the same object. The converter is
    built the first time the class is called and reused afterwards.
    """

    # The one shared instance; stays None until the first call.
    _instance = None
    # Serialises first-time construction across threads.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, building it on the first call."""
        # Fast path: once the instance exists no locking is needed.
        if cls._instance is not None:
            return cls._instance

        with cls._lock:
            # Check again under the lock: another thread may have finished
            # construction while this one was waiting.
            if cls._instance is None:
                fresh = super().__new__(cls)
                fresh._initialize()
                # Publish only after initialisation so other threads never
                # observe an instance without a converter.
                cls._instance = fresh
        return cls._instance

    def _initialize(self):
        """Create the converter held by this instance."""
        self.converter = DocumentConverter()

    def get_text(self, pdf_path: str) -> str:
        """Convert the document at *pdf_path* and return it as Markdown.

        Args:
            pdf_path: Path of the file handed to the converter.

        Returns:
            The converted document exported via ``export_to_markdown()``.
        """
        return self.converter.convert(pdf_path).document.export_to_markdown()
# Blocking extraction routine, intended to be run on the thread pool
def sync_extract_text(pdf_path: str) -> str:
    """Return the Markdown text of the document at *pdf_path*.

    This call blocks until the shared converter has finished.
    """
    shared_converter = PdfConverterSingleton()
    return shared_converter.get_text(pdf_path)
# Async wrapper for thread pool
async def async_extract_text(pdf_path: str) -> str:
    """Run ``sync_extract_text`` on the shared thread pool and await it.

    Args:
        pdf_path: Path of the file to convert.

    Returns:
        The Markdown text produced by ``sync_extract_text``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # preferred accessor there, rather than get_event_loop().
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(thread_pool, sync_extract_text, pdf_path)
# Main endpoint
def _save_upload(source, destination: str) -> None:
    """Copy the uploaded binary stream *source* into a new file at *destination*."""
    with open(destination, "wb") as buffer:
        shutil.copyfileobj(source, buffer)


@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown.

    The upload is written to a uniquely named temporary file, converted via
    ``async_extract_text``, and the temporary file is removed afterwards
    whether or not the conversion succeeded.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.

    Returns:
        A JSON response of the form ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 if the content type is not ``application/pdf``;
            500 (with the error message as detail) if saving or converting
            the file fails.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    # Use the platform's temp directory instead of a hard-coded "/tmp".
    temp_filename = os.path.join(tempfile.gettempdir(), f"{uuid4().hex}.pdf")
    try:
        # Copying the upload to disk is blocking I/O; run it on the loop's
        # default executor so the event loop keeps serving other requests.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _save_upload, file.file, temp_filename)
        text = await async_extract_text(temp_filename)
        return JSONResponse(content={"markdown_text": text})
    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # EAFP cleanup: the file may never have been created if the save
        # step failed, and a check-then-remove sequence would be racy.
        try:
            os.remove(temp_filename)
        except FileNotFoundError:
            pass