Closed
Description
Question
Can I create my own custom OCR class and pass it in through ocr_options? Is there any example code that could help me get started?
...
Will this work?
import os
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions, TesseractOcrOptions, OcrMacOptions, RapidOcrOptions, smolvlm_picture_description
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling_core.types.doc import ImageRefMode
from docling_core.types.doc import BoundingBox, DocItemLabel, ProvenanceItem
from PIL import Image
from surya.detection import DetectionPredictor
from surya.recognition import RecognitionPredictor
class SuryaOcrModel(BaseEnrichmentModel):
    """Enrichment stage that runs Surya OCR on an element's image.

    Every non-empty text line recognised by Surya is added to the document
    as a text item carrying the line's bounding box and the element's page
    number.

    NOTE(review): docling's documented extension point for OCR engines is
    its OCR plugin mechanism rather than an enrichment model -- confirm
    that an enrichment stage is really what is wanted here.
    """

    def __init__(self, enabled: bool = True):
        """Instantiate the Surya predictors.

        Args:
            enabled: When False, ``is_processable`` rejects every element.
        """
        self.enabled = enabled
        self.recognition_predictor = RecognitionPredictor()
        self.detection_predictor = DetectionPredictor()

    def is_processable(self, doc, element) -> bool:
        """Return True when the stage is enabled and *element* is a page image."""
        # NOTE(review): assumes elements expose a ``type`` attribute whose
        # value can be "page_image" -- verify against the item classes the
        # pipeline actually passes to enrichment models.
        return self.enabled and element.type == "page_image"

    def __call__(self, doc, element_batch: Iterable) -> Iterable:
        """Run OCR on each element and yield the element back unchanged.

        Args:
            doc: Document that receives the recognised text lines.
            element_batch: Elements to process, one OCR call per element.

        Yields:
            Each input element, after its text lines were added to *doc*.
        """
        for element in element_batch:
            # NOTE(review): assumes the element provides ``get_image(doc)``
            # returning a PIL image -- confirm for the element type in use.
            image: Image.Image = element.get_image(doc).convert("RGB")

            # Surya returns one result object per input image; the result is
            # not itself subscriptable, so index the list exactly once.
            predictions = self.recognition_predictor(
                [image], [None], self.detection_predictor
            )
            prediction = predictions[0]

            for line in prediction.text_lines:
                text = line.text.strip()
                if not text:
                    continue

                # Surya reports (x1, y1, x2, y2) with a top-left origin.
                # NOTE(review): these are pixel coordinates of the rendered
                # image; presumably they must be divided by ``images_scale``
                # to obtain page coordinates -- confirm.
                left, top, right, bottom = line.bbox
                bbox = BoundingBox(l=left, t=top, r=right, b=bottom)

                # NOTE(review): assumes the element exposes ``page_no``
                # directly -- verify against the element type.
                doc.add_text(
                    label=DocItemLabel.TEXT,
                    text=text,
                    prov=ProvenanceItem(
                        page_no=element.page_no,
                        bbox=bbox,
                        charspan=(0, len(text)),
                    ),
                )
            yield element
class SuryaOcrPipeline(StandardPdfPipeline):
    """Standard PDF pipeline whose enrichment stage list holds only Surya OCR."""

    def __init__(self, pipeline_options):
        """Build the standard pipeline, then install the Surya stage.

        Args:
            pipeline_options: Options forwarded unchanged to the parent
                pipeline constructor.
        """
        super().__init__(pipeline_options)
        # Overwrites any ``enrichment_pipe`` value left by the parent
        # constructor with a list containing just the Surya model.
        self.enrichment_pipe = [SuryaOcrModel(enabled=True)]

    @classmethod
    def get_default_options(cls):
        """Return pipeline options with page images, 2x scale and OCR enabled."""
        # NOTE(review): ``do_ocr=True`` presumably also keeps the pipeline's
        # built-in OCR stage active alongside Surya -- confirm this is wanted.
        default_options = PdfPipelineOptions(
            generate_page_images=True,
            images_scale=2.0,
            do_ocr=True,
        )
        return default_options
Converting
# Convert one input file with the Surya-backed pipeline.
input_pdf_path = Path("./img/test.png")
output_dir = Path("parsed-doc-advanced/test")
output_dir.mkdir(parents=True, exist_ok=True)

# Take the defaults from the pipeline class defined in this file (the
# previously referenced ``PyMuPdfOcrPipeline`` is not defined anywhere here).
pipeline_options = SuryaOcrPipeline.get_default_options()

surya_format_option = PdfFormatOption(
    pipeline_cls=SuryaOcrPipeline,
    pipeline_options=pipeline_options,
)

# The input above is a PNG, so the custom pipeline must be registered for
# image inputs as well; a PDF-only registration would not apply to this file.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: surya_format_option,
        InputFormat.IMAGE: surya_format_option,
    }
)
result = converter.convert(input_pdf_path)
Is this the right way to use a custom OCR engine? I created a class and passed it in through pipeline_cls.