
Quantized Nanonets-OCR-s Model 😘

Requirements

pip install vllm
pip install pdf2image
apt-get install -y poppler-utils
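
Optional: a quick sanity check that poppler is on PATH (pdf2image shells out to poppler's pdftoppm) and that vllm imports cleanly. This is just a minimal sketch, not required.

import shutil
import vllm

# pdf2image relies on poppler's pdftoppm binary being available on PATH
assert shutil.which("pdftoppm") is not None, "poppler-utils not found on PATH"
print("vllm version:", vllm.__version__)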

Initialize the model. It uses about 5 GB of VRAM.

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
model_name = "jester6136/Nanonets-OCR-s-w8a8"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = LLM(model=model_name, trust_remote_code=True,
          gpu_memory_utilization=0.5,
          max_model_len=10000,
          max_num_seqs=1)
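
If LLM(...) fails with an out-of-memory error, check how much headroom the GPU actually has: the quantized weights occupy roughly 5 GB (per the note above), and gpu_memory_utilization=0.5 lets vLLM claim about half of the card's total memory for weights plus KV cache. A minimal check, assuming PyTorch with a CUDA device:

import torch

# Free/total device memory in bytes; the weights need ~5 GB, and vLLM will
# reserve about 50% of total memory when gpu_memory_utilization=0.5.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"GPU free: {free_bytes / 1e9:.1f} GB / total: {total_bytes / 1e9:.1f} GB")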

Extract a PDF as markdown.

from PIL import Image
import io
from pdf2image import convert_from_path
from vllm import SamplingParams

def make_prompt(question: str) -> str:
    return (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n"
        "<|vision_start|><|image_pad|><|vision_end|>\n"
        f"{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

question_text = (
    "Extract the text from the above document as if you were reading it naturally. Keep fontweight as **."
)

sampling_params = SamplingParams(
    repetition_penalty=1.05,
    temperature=0.0,
    max_tokens=10000
)

pdf_path = "your_pdf_path.pdf"
images = convert_from_path(pdf_path)

def downscale_image(img: Image.Image, max_dim: int = 768) -> Image.Image:
    width, height = img.size
    if max(width, height) <= max_dim:
        return img
    if width > height:
        new_width = max_dim
        new_height = int((max_dim / width) * height)
    else:
        new_height = max_dim
        new_width = int((max_dim / height) * width)
    return img.resize((new_width, new_height), Image.LANCZOS)

# ⬇️ Prepare batched inputs
batched_inputs = []
for page_num, image in enumerate(images):
    print(f"Preparing page {page_num + 1}...")

    image = downscale_image(image)
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format='PNG')
    pil_image = Image.open(io.BytesIO(img_byte_arr.getvalue()))

    prompt = make_prompt(question_text)
    batched_inputs.append({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [pil_image]
        }
    })

# ⬇️ Run batched inference
print("Running batch inference...")
batched_outputs = llm.generate(batched_inputs, sampling_params=sampling_params)

# ⬇️ Collect results
all_outputs = []
for page_num, output in enumerate(batched_outputs):
    extracted_text = output.outputs[0].text
    all_outputs.append(f"<page_number>{page_num + 1}</page_number>\n{extracted_text}\n{'-'*50}")

# ⬇️ Save to file
output_file = "/content/extracted_text.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(all_outputs))

print(f"βœ… Extracted text saved to: {output_file}")