docling-app

Sleeping

App Files Files Community

AyoubChLin committed on Jun 2

Commit

19907be

verified ·

1 Parent(s): f3672be

[INIT]

Browse files

Files changed (3) hide show

Dockerfile +16 -0
app.py +58 -0
requirements.txt +2 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+# Image for the FastAPI app defined in app.py, served by uvicorn.
+FROM python:3.10
+# Create an unprivileged account (UID 1000, with a home directory) and switch
+# to it, so every later instruction and the running server execute as "user".
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's local bin directory first on PATH.
+# NOTE(review): presumably so console scripts that pip installs for this
+# non-root user (e.g. uvicorn) can be found -- confirm.
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Copy only requirements.txt before installing, so the dependency layer does
+# not depend on the rest of the application sources.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Copy the application sources, owned by the unprivileged user.
+COPY --chown=user . /app
+# Serve the "app" object from app.py on all interfaces, port 7860.
+# NOTE(review): 7860 is presumably the port the Space expects; see the doc
+# linked at the top of this file.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+import shutil
+import os
+from uuid import uuid4
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from threading import Lock
+# ASGI application object; the route below is registered on it via @app.post.
+app = FastAPI()
+# Singleton class for PdfConverter
+class PdfConverterSingleton:
+    _instance = None
+    _lock = Lock()
+    def __new__(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    instance = super().__new__(cls)
+                    instance._initialize()
+                    cls._instance = instance
+        return cls._instance
+    def _initialize(self):
+        self.converter = PdfConverter(artifact_dict=create_model_dict())
+    def get_text(self, pdf_path: str) -> str:
+        rendered = self.converter(pdf_path)
+        text, _, _ = text_from_rendered(rendered)
+        return str(text)
+# API function to call converter
+def extract_text_from_pdf(pdf_path: str) -> str:
+    return PdfConverterSingleton().get_text(pdf_path)
+# Endpoint to upload a file and extract markdown text
+@app.post("/extract-pdf-text")
+async def extract_pdf_text(file: UploadFile = File(...)):
+    if file.content_type != "application/pdf":
+        raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+    temp_filename = f"/tmp/{uuid4().hex}.pdf"
+    try:
+        with open(temp_filename, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        text = extract_text_from_pdf(temp_filename)
+        return JSONResponse(content={"markdown_text": text})
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        if os.path.exists(temp_filename):
+            os.remove(temp_filename)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ fastapi[standard]
2	+ marker-pdf