Merge pull request #30 from useblocks/rect-model

Rect model
useblocks · Jan 23, 2024 · aa2999f · aa2999f
2 parents d08febc + 946c820
commit aa2999f
Show file tree

Hide file tree

Showing 12 changed files with 239 additions and 26 deletions.
diff --git a/docs/contents/api.rst b/docs/contents/api.rst
@@ -72,6 +72,11 @@ Figure
 
 .. autoclass:: libpdf.models.figure.Figure
 
+Rect
+~~~~
+
+.. autoclass:: libpdf.models.rect.Rect
+
 
 Position
 ~~~~~~~~

diff --git a/docs/contents/changelog.rst b/docs/contents/changelog.rst
@@ -20,11 +20,15 @@ __ https://github.com/useblocks/libpdf/compare/v0.0.1...HEAD
 Added
 ~~~~~
 
+- Introduced new element ``Rect`` (`PR #30 <https://github.com/useblocks/libpdf/pull/30>`_)
+- Introduced Ruff as linter and formatter (PRs `#28 <https://github.com/useblocks/libpdf/pull/28>`_,
+  `#29 <https://github.com/useblocks/libpdf/pull/29>`_ and `#31 <https://github.com/useblocks/libpdf/pull/31>`_)
+- Added support for Python 3.10, 3.11 and 3.12 (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)
+
 Changed
 ~~~~~~~
 
 - Updated downstream library dependencies
-
 - Added flag ``--no_annotations`` to exclude annotation extraction from the catalog to speed up extraction
   (`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)
 
@@ -33,6 +37,8 @@ Removed
 
 - libpdf has dropped support for Python 3.6, which reached end-of-life on 2021-12-23; this also fixes the Pillow 8
   security vulnerabilities
+- libpdf has dropped support for Python 3.7, which reached end-of-life on 2023-06-27
+  (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)
 
 Fixed
 ~~~~~

diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml
@@ -176,6 +176,14 @@
             **rel_path**: figures/<image>
         }
 
+        class Rect {
+            + [tuple] non_stroking_color
+            --
+            **id**: ""rect.<1,2,3,n>""
+            **non_stroking_color**: RGB color space,
+            given as floats between 0 and 1
+        }
+
         package x <<layout>> {
 
            class HorizontalBox {
@@ -217,9 +225,7 @@
                 --
                 **text**: the character itself
             }
-
-
-
+        }
         Char "+chars  1..*" ---left*  Word
         Word "+words  1..*" ---left*  HorizontalLine
         HorizontalLine "+lines  1..*" --*  HorizontalBox
@@ -228,6 +234,7 @@
         HorizontalBox "+textbox  0..1" --* Cell
         HorizontalBox "+textboxes  0..*" --* Figure
         HorizontalBox "+textbox  0..1" --* Chapter
+        HorizontalBox "+textbox  0..1" --* Rect
 
         Paragraph "+b_source  1" *-- "+links  *" Link
         Figure "+b_source  1" *-- "+links  *" Link
@@ -240,6 +247,7 @@
         Figure -[#COL_DERIVE]-|> Element
         Chapter -[#COL_DERIVE]-|> Element
         Paragraph -[#COL_DERIVE]|> Element
+        Rect -[#COL_DERIVE]|> Element
 
         Chapter "1" *-- "+content  *" Element: ordered
         Page "1" *-- "+content  *" Element: ordered

diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py
@@ -8,6 +8,7 @@
 from libpdf.models.chapter import Chapter
 from libpdf.models.figure import Figure
 from libpdf.models.paragraph import Paragraph
+from libpdf.models.rect import Rect
 from libpdf.models.root import Root
 from libpdf.models.table import Table
 
@@ -36,6 +37,7 @@ def __init__(  # pylint: disable=too-many-arguments  # the parameters are needed
         paragraphs: List[Paragraph],
         tables: List[Table],
         figures: List[Figure],
+        rects: List[Rect],
         pdfplumber: PDF,
         pdfminer: PDFDocument,
     ):
@@ -45,7 +47,11 @@ def __init__(  # pylint: disable=too-many-arguments  # the parameters are needed
 
         # attributes for API convenience
         self.flattened = Flattened(
-            chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures
+            chapters=chapters,
+            paragraphs=paragraphs,
+            tables=tables,
+            figures=figures,
+            rects=rects,
         )
 
         # exposing the pdfplumber PDF object
@@ -71,3 +77,4 @@ class Flattened(NamedTuple):
     paragraphs: List[Paragraph]
     tables: List[Table]
     figures: List[Figure]
+    rects: List[Rect]
diff --git a/libpdf/catalog.py b/libpdf/catalog.py
@@ -3,7 +3,7 @@
 import re
 from typing import Any, Dict, List, Union
 
-from pdfminer.pdftypes import PDFObjRef
+from pdfminer.pdftypes import PDFObjRef, resolve1
 from pdfminer.psparser import PSLiteral
 
 from libpdf.log import logging_needed
@@ -51,7 +51,7 @@ def get_named_destination(pdf):  # pylint: disable=too-many-branches
         ):
             name_tree = pdf_catalog["Names"].resolve()["Dests"].resolve()
         elif isinstance(pdf_catalog["Names"], dict) and "Dests" in pdf_catalog["Names"]:
-            name_tree = pdf_catalog["Names"]["Dests"].resolve()
+            name_tree = resolve1(pdf_catalog["Names"]["Dests"])
         # check if name tree not empty
         if name_tree:
             # map page id to page number

diff --git a/libpdf/core.py b/libpdf/core.py
@@ -35,6 +35,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     no_paragraphs: bool = False,
     no_tables: bool = False,
     no_figures: bool = False,
+    no_rects: bool = False,
     cli_usage: bool = False,
     visual_debug: bool = False,
     visual_debug_output_dir: str = None,
@@ -66,6 +67,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
+    :param no_rects: flag triggering the exclusion of rects
     :param cli_usage: flag indicating that the function was called through CLI
     :param visual_debug: flag triggering visual debug feature
     :param visual_debug_output_dir: output directory for visualized pdf pages
@@ -120,6 +122,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
         LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes")
         LOG.info("Extract tables: %s", "no" if no_tables else "yes")
         LOG.info("Extract figures: %s", "no" if no_figures else "yes")
+        LOG.info("Extract rects: %s", "no" if no_rects else "yes")
         overall_pbar.update(1)
         try:
             objects = extract(
@@ -133,6 +136,7 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
                 no_paragraphs,
                 no_tables,
                 no_figures,
+                no_rects,
                 overall_pbar,
             )
         except LibpdfException:
@@ -177,6 +181,7 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     no_paragraphs: bool = False,
     no_tables: bool = False,
     no_figures: bool = False,
+    no_rects: bool = False,
     init_logging: bool = True,
     visual_debug: bool = False,
     visual_debug_output_dir: str = "visual_debug_libpdf",
@@ -202,6 +207,7 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
+    :param no_rects: flag triggering the exclusion of rects
     :param init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of
                          handling both log messages and progress bars; it does so by passing all log messages to
                          tqdm.write()
@@ -245,6 +251,7 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
         no_paragraphs=no_paragraphs,
         no_tables=no_tables,
         no_figures=no_figures,
+        no_rects=no_rects,
         cli_usage=False,
         visual_debug=visual_debug,
         visual_debug_output_dir=visual_debug_output_dir,
@@ -463,6 +470,13 @@ def handle_parse_result(self, ctx, opts, args):
     help="Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if"
     " --save-figures is given.",
 )
+@click.option(
+    "--no-rects",
+    is_flag=True,
+    show_default=True,
+    help="Skip rects. rects will not be part of the output JSON/YAML structures and also not saved if"
+    " --save-rects is given.",
+)
 @click.option("-vd", "--visual-debug", is_flag=True, help="Visual debug libpdf.")
 @click.option(
     "-vo",

diff --git a/libpdf/extract.py b/libpdf/extract.py
@@ -21,6 +21,7 @@
 from libpdf.models.file_meta import FileMeta
 from libpdf.models.page import Page
 from libpdf.models.position import Position
+from libpdf.models.rect import Rect
 from libpdf.models.root import Root
 from libpdf.parameters import (
     FIGURE_MIN_HEIGHT,
@@ -33,7 +34,12 @@
 from libpdf.progress import bar_format_lvl2, tqdm
 from libpdf.tables import extract_pdf_table
 from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters
-from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox
+from libpdf.utils import (
+    lt_page_crop,
+    lt_textbox_crop,
+    lt_to_libpdf_hbox_converter,
+    to_pdfplumber_bbox,
+)
 
 LOG = logging.getLogger(__name__)
 
@@ -61,6 +67,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     no_paragraphs: bool,
     no_tables: bool,
     no_figures: bool,
+    no_rects: bool,
     overall_pbar: tqdm,
 ) -> ApiObjects:
     """
@@ -76,6 +83,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
+    :param no_rects: flag triggering the exclusion of rects
     :param overall_pbar: total progress bar for whole libpdf run
     :return: instance of Objects class
     :raise LibpdfException: PDF contains no pages
@@ -142,7 +150,17 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
             # smartly remove figures that are in header and footer
             if smart_page_crop:
                 figure_list = smart_page_crop_header_footer(pdf, figure_list)
-        overall_pbar.update(30)
+        overall_pbar.update(15)
+
+        if no_rects:
+            LOG.info("Excluding rects extraction")
+            rect_list = []
+        else:
+            rect_list = extract_rects(pdf, pages_list)
+            # smartly remove rects that are in header and footer
+            if smart_page_crop:
+                rect_list = smart_page_crop_header_footer(pdf, rect_list)
+        overall_pbar.update(15)
 
         if no_tables:
             LOG.info("Excluding tables extraction")
@@ -169,7 +187,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
             paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list)
 
         element_list = pro.merge_all_elements(
-            figure_list, table_list, paragraph_list, chapter_list
+            figure_list, table_list, paragraph_list, chapter_list, rect_list
         )
 
         # to check if elements shall be mapped into nested outline structure.
@@ -195,6 +213,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
         paragraphs=paragraph_list,
         tables=table_list,
         figures=figure_list,
+        rects=rect_list,
         pdfplumber=pdf,
         pdfminer=pdf.doc,
     )
@@ -640,6 +659,76 @@ def extract_figures(
     return figure_list
 
 
+def extract_rects(
+    pdf,
+    pages_list,
+) -> List[Rect]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
+    """Extract rects in PDF."""
+    LOG.info("Extracting rects ...")
+    rect_list = []
+
+    for idx_page, page in enumerate(  # pylint: disable=too-many-nested-blocks
+        tqdm(
+            pdf.pages,
+            desc="###### Extracting rects",
+            unit="pages",
+            bar_format=bar_format_lvl2(),
+        ),
+    ):
+        if logging_needed(idx_page, len(pdf.pages)):
+            LOG.debug("Extracting rects page %s of %s", idx_page + 1, len(pdf.pages))
+        page_crop = pro.remove_page_header_footer(page)
+        lt_page = page._layout  # pylint: disable=protected-access  # easiest way to obtain LTPage
+
+        # rects are read from the uncropped page; the cropped/filtered variants below are disabled
+        # figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else []
+        # rects = page_crop.objects['rects'] if 'rects' in page_crop.objects else []
+        rects = page.objects["rect"] if "rect" in page.objects else []
+
+        if len(rects) != 0:
+            for idx_rect, rect in enumerate(rects):
+                rect_pos = Position(
+                    float(rect["x0"]),
+                    float(rect["y0"]),
+                    float(rect["x1"]),
+                    float(rect["y1"]),
+                    pages_list[idx_page],
+                )
+
+                non_stroking_color = rect["non_stroking_color"]
+
+                offset = 5
+                rect_bbox = (
+                    rect_pos.x0 - offset,
+                    rect_pos.y0 - offset,
+                    rect_pos.x1 + offset,
+                    rect_pos.y1 + offset,
+                )
+
+                LOG.info(
+                    f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}"
+                )
+                lt_textbox = lt_textbox_crop(
+                    rect_bbox,
+                    lt_page._objs,
+                    word_margin=LA_PARAMS["word_margin"],
+                    y_tolerance=LA_PARAMS["line_overlap"],
+                )
+                if lt_textbox:
+                    hbox = lt_to_libpdf_hbox_converter(lt_textbox)
+                else:
+                    hbox = None
+
+                rect = Rect(idx_rect + 1, rect_pos, hbox, non_stroking_color)
+                rect_list.append(rect)
+
+        else:
+            LOG.info(f"found no rects on page {idx_page+1}: {page_crop.objects.keys()}")
+
+    # return figure_list
+    return rect_list
+
+
 def images_to_save(pdf, figure_list):
     """Save images to given path."""
     for fig in figure_list:
@@ -708,6 +797,11 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
         if figure["height"] > FIGURE_MIN_HEIGHT and figure["width"] > FIGURE_MIN_WIDTH:
             filtered_figures.append(figure)
 
+    if len(filtered_figures) < len(figures_list):
+        LOG.debug(
+            f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)}  due to invalid height/width"
+        )
+
     for figure in filtered_figures:
         # if figure exceed the boundary of the page, then only keep the part of figure that inside this page
         if not (
@@ -734,6 +828,7 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
             and fig0["y1"] >= fig1["y1"]
         ):
             if fig1 in filtered_figures:
+                LOG.debug("remove filtered figure due to contained in other figure")
                 filtered_figures.remove(fig1)
 
     # check if figures partially overlap
@@ -754,9 +849,20 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
                 # compare the size of two figures, keep the bigger figure
                 if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]:
                     if fig0 in filtered_figures:
+                        LOG.debug(
+                            "remove filtered figure fig0 due to partially overlap"
+                        )
                         filtered_figures.remove(fig0)
                 else:
                     if fig1 in filtered_figures:
+                        LOG.debug(
+                            "remove filtered figure fig1 due to partially overlap"
+                        )
                         filtered_figures.remove(fig1)
 
+    if len(filtered_figures) < len(figures_list):
+        LOG.debug(
+            f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures"
+        )
+
     return filtered_figures