Skip to content

Custom OCR #1502

Closed
Closed
@pusapatiakhilraju

Description

@pusapatiakhilraju

Question

Can I create my own custom OCR class and pass it in to `ocr_options`? Is there any example code that can help me get started?
...

Will this work?

import os
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions, TesseractOcrOptions, OcrMacOptions, RapidOcrOptions, smolvlm_picture_description
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling_core.types.doc import BoundingBox, ImageRefMode, TextItem
from PIL import Image
from surya.detection import DetectionPredictor
from surya.recognition import RecognitionPredictor


class SuryaOcrModel(BaseEnrichmentModel):
    """Enrichment model that runs Surya OCR on page-image elements.

    For each element in a batch, the element's image is converted to RGB and
    handed to the Surya recognition predictor (with the detection predictor
    passed along as an argument). Every recognized line whose stripped text is
    non-empty is added to the document as a ``TextItem`` built from the text,
    the line's bounding box and the element's page number.

    NOTE(review): the Surya call shape and the docling-core calls used in
    ``__call__`` (``BoundingBox.from_ltrb``, the ``TextItem`` keyword
    arguments, ``doc.add_item``) are taken as written and have not been
    checked against a specific library release — confirm before relying on
    this class.
    """

    def __init__(self, enabled: bool = True):
        """Create the model and instantiate both Surya predictors.

        Args:
            enabled: When falsy, ``is_processable`` rejects every element.
                The predictors are constructed regardless of this flag.
        """
        super().__init__()
        self.enabled = enabled
        self.recognition_predictor = RecognitionPredictor()
        self.detection_predictor = DetectionPredictor()

    def is_processable(self, doc, element) -> bool:
        """Tell whether ``element`` should be handled by this model.

        Args:
            doc: The document being enriched (not inspected here).
            element: Candidate element; its ``type`` attribute is compared
                against ``"page_image"``.

        Returns:
            A truthy value only when the model is enabled and the element's
            type equals ``"page_image"``.
        """
        # NOTE(review): confirm that the elements handed to enrichment models
        # expose a ``type`` attribute that can take the value "page_image".
        return self.enabled and element.type == "page_image"

    def __call__(self, doc, element_batch: Iterable) -> Iterable:
        """OCR every element's image and add the recognized lines to ``doc``.

        Args:
            doc: Document that receives the recognized text items.
            element_batch: Elements to process; each one is expected to offer
                ``get_image(doc)`` and a ``page_no`` attribute.

        Yields:
            Each input element, once all of its lines have been added.
        """
        for element in element_batch:
            image: Image.Image = element.get_image(doc).convert("RGB")

            # Run Surya OCR on this single image.
            # NOTE(review): the ``[None]`` second argument and the ``[0][0]``
            # indexing assume a particular Surya call signature and return
            # shape — verify against the installed Surya version.
            prediction = self.recognition_predictor([image], [None], self.detection_predictor)[0][0]

            for line in prediction.text_lines:
                text = line.text.strip()
                if not text:
                    # Nothing but whitespace was recognized on this line.
                    continue

                # The snippet's author states the bbox is already in
                # left/top/right/bottom order — TODO confirm.
                left, top, right, bottom = line.bbox
                bbox = BoundingBox.from_ltrb(left, top, right, bottom)
                doc.add_item(TextItem(text=text, bbox=bbox, page_no=element.page_no))

            yield element

class SuryaOcrPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage is a single ``SuryaOcrModel``."""

    def __init__(self, pipeline_options):
        """Initialise the parent pipeline, then install the Surya enrichment model.

        Args:
            pipeline_options: Forwarded unchanged to ``StandardPdfPipeline``.
        """
        super().__init__(pipeline_options)
        # The attribute is rebound to a fresh one-element list, so whatever the
        # parent constructor may have stored in ``enrichment_pipe`` is dropped.
        self.enrichment_pipe = [SuryaOcrModel(enabled=True)]

    @classmethod
    def get_default_options(cls):
        """Build the default options for this pipeline.

        Returns:
            A ``PdfPipelineOptions`` with page-image generation switched on,
            an image scale of 2.0 and ``do_ocr`` set to ``True``.
        """
        defaults = PdfPipelineOptions(
            generate_page_images=True,
            images_scale=2.0,
            do_ocr=True,
        )
        return defaults

Converting

# NOTE(review): the variable is called ``input_pdf_path`` but points at a PNG,
# while the converter below only registers an option under ``InputFormat.PDF``
# — confirm that the custom pipeline is actually selected for image inputs.
input_pdf_path = Path("./img/test.png")
output_dir = Path("parsed-doc-advanced/test")
output_dir.mkdir(parents=True, exist_ok=True)

# Take the defaults from the same pipeline class that is passed as
# ``pipeline_cls`` below, so options and pipeline always match.
pipeline_options = SuryaOcrPipeline.get_default_options()

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=SuryaOcrPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert(input_pdf_path)

Is this the right way to use a custom OCR engine? I created a class and passed it as `pipeline_cls`.

Metadata

Metadata

Assignees

No one assigned

    Labels

    question — Further information is requested

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions