From 5433f179353d9525a0af8641796a7848f9ac910a Mon Sep 17 00:00:00 2001
From: Joerg Kreuzberger <j.kreuzberger@procitec.de>
Date: Wed, 17 Jan 2024 10:23:05 +0100
Subject: [PATCH 01/18] cherry pick the commit from upgrade branch

-textboxes are not excluded for rects
-add rect model
-extract rect
---
 docs/contents/api.rst |   5 ++
 libpdf/apiobjects.py  |   7 +--
 libpdf/core.py        |  37 ++++++++------
 libpdf/extract.py     | 114 ++++++++++++++++++++++++++++++++++++++++--
 libpdf/models/rect.py |  57 +++++++++++++++++++++
 libpdf/parameters.py  |  27 +++++-----
 libpdf/process.py     |  33 ++++++------
 libpdf/textbox.py     |  14 ++++--
 libpdf/utils.py       |  35 +++++++------
 9 files changed, 257 insertions(+), 72 deletions(-)
 create mode 100644 libpdf/models/rect.py

diff --git a/docs/contents/api.rst b/docs/contents/api.rst
index ee1b95c..b56afa6 100644
--- a/docs/contents/api.rst
+++ b/docs/contents/api.rst
@@ -72,6 +72,11 @@ Figure
 
 .. autoclass:: libpdf.models.figure.Figure
 
+Rect
+~~~~~~
+
+.. autoclass:: libpdf.models.rect.Rect
+
 
 Position
 ~~~~~~~~
diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py
index b334f46..042d36b 100644
--- a/libpdf/apiobjects.py
+++ b/libpdf/apiobjects.py
@@ -7,6 +7,7 @@
 
 from libpdf.models.chapter import Chapter
 from libpdf.models.figure import Figure
+from libpdf.models.rect import Rect
 from libpdf.models.paragraph import Paragraph
 from libpdf.models.root import Root
 from libpdf.models.table import Table
@@ -36,6 +37,7 @@ def __init__(  # pylint: disable=too-many-arguments  # the parameters are needed
         paragraphs: List[Paragraph],
         tables: List[Table],
         figures: List[Figure],
+        rects: List[Rect],
         pdfplumber: PDF,
         pdfminer: PDFDocument,
     ):
@@ -44,9 +46,7 @@ def __init__(  # pylint: disable=too-many-arguments  # the parameters are needed
         self.root = root
 
         # attributes for API convenience
-        self.flattened = Flattened(
-            chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures
-        )
+        self.flattened = Flattened(chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures, rects=rects)
 
         # exposing the pdfplumber PDF object
         self.pdfplumber = pdfplumber
@@ -71,3 +71,4 @@ class Flattened(NamedTuple):
     paragraphs: List[Paragraph]
     tables: List[Table]
     figures: List[Figure]
+    rects: List[Rect]
diff --git a/libpdf/core.py b/libpdf/core.py
index d21da08..034bba1 100644
--- a/libpdf/core.py
+++ b/libpdf/core.py
@@ -35,6 +35,8 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     no_paragraphs: bool = False,
     no_tables: bool = False,
     no_figures: bool = False,
+    no_rects: bool = False,
+    crop_rects_text: bool = False,
     cli_usage: bool = False,
     visual_debug: bool = False,
     visual_debug_output_dir: str = None,
@@ -66,6 +68,8 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
+    :param no_rects: flag triggering the exclusion of rects
+    :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs
     :param cli_usage: flag indicating that the function was called through CLI
     :param visual_debug: flag triggering visual debug feature
     :param visual_debug_output_dir: output directory for visualized pdf pages
@@ -106,20 +110,17 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
         if output_path:
             LOG.info("Output path: %s", output_path)
         else:
-            LOG.info("Writing extracted data to stdout")
-        LOG.info(
-            "Page range: [%s]", "all" if not pages else ",".join(str(x) for x in pages)
-        )
-        LOG.info(
-            "Page crop: %s",
-            "not cropped" if not page_crop else " ".join(str(x) for x in page_crop),
-        )
-        LOG.info("Smart page crop: %s", "on" if smart_page_crop else "off")
-        LOG.info("Extract annotations: %s", "no" if no_annotations else "yes")
-        LOG.info("Extract chapters: %s", "no" if no_chapters else "yes")
-        LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes")
-        LOG.info("Extract tables: %s", "no" if no_tables else "yes")
-        LOG.info("Extract figures: %s", "no" if no_figures else "yes")
+            LOG.info('Writing extracted data to stdout')
+        LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages))
+        LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop))
+        LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off')
+        LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes')
+        LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes')
+        LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes')
+        LOG.info('Extract tables: %s', 'no' if no_tables else 'yes')
+        LOG.info('Extract figures: %s', 'no' if no_figures else 'yes')
+        LOG.info('Extract rects: %s', 'no' if no_rects else 'yes')
+        LOG.info('Text rects crop: %s', 'yes' if crop_rects_text else 'no')
         overall_pbar.update(1)
         try:
             objects = extract(
@@ -133,6 +134,8 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
                 no_paragraphs,
                 no_tables,
                 no_figures,
+                no_rects,
+                crop_rects_text,
                 overall_pbar,
             )
         except LibpdfException:
@@ -177,6 +180,8 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     no_paragraphs: bool = False,
     no_tables: bool = False,
     no_figures: bool = False,
+    no_rects: bool = False,
+    crop_rects_text: bool = False,
     init_logging: bool = True,
     visual_debug: bool = False,
     visual_debug_output_dir: str = "visual_debug_libpdf",
@@ -202,6 +207,8 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
+    :param no_rects: flag triggering the exclusion of rects
+    :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs
     :param init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of
                          handling both log messages and progress bars; it does so by passing all log messages to
                          tqdm.write()
@@ -245,6 +252,8 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
         no_paragraphs=no_paragraphs,
         no_tables=no_tables,
         no_figures=no_figures,
+        no_rects = no_rects,
+        crop_rects_text = crop_rects_text,
         cli_usage=False,
         visual_debug=visual_debug,
         visual_debug_output_dir=visual_debug_output_dir,
diff --git a/libpdf/extract.py b/libpdf/extract.py
index 833c6a1..681d6b5 100644
--- a/libpdf/extract.py
+++ b/libpdf/extract.py
@@ -17,6 +17,7 @@
 from libpdf.exceptions import LibpdfException
 from libpdf.log import logging_needed
 from libpdf.models.figure import Figure
+from libpdf.models.rect import Rect
 from libpdf.models.file import File
 from libpdf.models.file_meta import FileMeta
 from libpdf.models.page import Page
@@ -61,6 +62,8 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     no_paragraphs: bool,
     no_tables: bool,
     no_figures: bool,
+    no_rects: bool,
+    crop_rects_text: bool,
     overall_pbar: tqdm,
 ) -> ApiObjects:
     """
@@ -76,6 +79,8 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
+    :param no_rects: flag triggering the exclusion of rects
+    :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs
     :param overall_pbar: total progress bar for whole libpdf run
     :return: instance of Objects class
     :raise LibpdfException: PDF contains no pages
@@ -142,7 +147,18 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
             # smartly remove figures that are in header and footer
             if smart_page_crop:
                 figure_list = smart_page_crop_header_footer(pdf, figure_list)
-        overall_pbar.update(30)
+        overall_pbar.update(15)
+
+        if no_rects:
+            LOG.info('Excluding rects extraction')
+            rect_list = []
+        else:
+            rect_list = extract_rects(pdf, pages_list, figure_dir)
+            # smartly remove rects that are in header and footer
+            if smart_page_crop:
+                rect_list = smart_page_crop_header_footer(pdf, rect_list)
+        overall_pbar.update(15)
+
 
         if no_tables:
             LOG.info("Excluding tables extraction")
@@ -159,6 +175,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
             pdf,
             figure_list,
             table_list,
+            rect_list if crop_rects_text else [],
             pages_list,
             no_chapters,
             no_paragraphs,
@@ -168,9 +185,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
         if smart_page_crop:
             paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list)
 
-        element_list = pro.merge_all_elements(
-            figure_list, table_list, paragraph_list, chapter_list
-        )
+        element_list = pro.merge_all_elements(figure_list, table_list, paragraph_list, chapter_list, rect_list)
 
         # to check if elements shall be mapped into nested outline structure.
         if catalog["outline"] is not None and not no_chapters:
@@ -195,6 +210,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
         paragraphs=paragraph_list,
         tables=table_list,
         figures=figure_list,
+        rects=rect_list,
         pdfplumber=pdf,
         pdfminer=pdf.doc,
     )
@@ -639,6 +655,85 @@ def extract_figures(
 
     return figure_list
 
+def extract_rects(
+    pdf,
+    pages_list,
+    figure_dir,
+) -> List[
+    Rect
+]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
+    """Extract the rects of all pages; each Rect carries the text boxes that lt_page_crop returns for its bbox."""
+    LOG.info('Extracting rects ...')
+    rect_list = []
+
+    for idx_page, page in enumerate(  # pylint: disable=too-many-nested-blocks
+        tqdm(pdf.pages, desc='###### Extracting rects', unit='pages', bar_format=bar_format_lvl2()),
+    ):
+        if logging_needed(idx_page, len(pdf.pages)):
+            LOG.debug('Extracting rects page %s of %s', idx_page + 1, len(pdf.pages))
+        page_crop = pro.remove_page_header_footer(page)
+        lt_page = page._layout  # pylint: disable=protected-access  # easiest way to obtain LTPage
+
+        # collect the rect objects of this page
+        # NOTE(review): the rects are read from the uncropped page; page_crop is only used by the log message in the
+        # else branch below - confirm that bypassing the header/footer crop is intended
+        rects = page.objects['rect'] if 'rect' in page.objects else []
+
+
+        if len(rects) != 0:
+            for idx_rect, rect in enumerate(rects):
+                rect_pos = Position(
+                    float(rect['x0']),
+                    float(rect['y0']),
+                    float(rect['x1']),
+                    float(rect['y1']),
+                    pages_list[idx_page],
+                )
+
+                non_stroking_color = rect['non_stroking_color']
+                fill = rect['fill']
+
+                bbox = (rect_pos.x0, rect_pos.y0, rect_pos.x1, rect_pos.y1)
+
+                LOG.info(f"found rect at {bbox} at page {idx_page+1}: color {non_stroking_color}");
+
+                lt_textboxes = lt_page_crop(
+                    bbox,
+                    lt_page._objs,  # pylint: disable=protected-access # access needed
+                    LTText,
+                    contain_completely=True,
+                )
+
+                textboxes = []
+                links = []
+                for lt_textbox in lt_textboxes:
+                    if catalog['annos']:
+                        links.extend(extract_linked_chars(lt_textbox, lt_page.pageid))
+                    bbox = (lt_textbox.x0, lt_textbox.y0, lt_textbox.x1, lt_textbox.y1)
+
+                    hbox = lt_to_libpdf_hbox_converter(lt_textbox)
+
+                    textboxes.append(hbox)
+
+                rect_name = f'page_{page.page_number}_rect.{idx_rect + 1}.png'
+
+                # create figures directory if not exist
+                Path(figure_dir).mkdir(parents=True, exist_ok=True)
+
+                rect_path = os.path.abspath(os.path.join(figure_dir, rect_name))
+
+                # NOTE(review): fill, rect_name, rect_path and the bbox reassigned in the loop above are never read,
+                # and nothing is written into figure_dir here although the directory gets created
+                rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color )
+                rect_list.append(rect)
+
+        else:
+            LOG.info(f"found no rects on page {idx_page+1}: {page_crop.objects.keys()}")
+
+
+    # the idx given to a Rect is 1-based and restarts on every page
+    return rect_list
+
 
 def images_to_save(pdf, figure_list):
     """Save images to given path."""
@@ -708,6 +803,11 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
         if figure["height"] > FIGURE_MIN_HEIGHT and figure["width"] > FIGURE_MIN_WIDTH:
             filtered_figures.append(figure)
 
+    if len(filtered_figures) < len(figures_list):
+        LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)}  due to invalid height/width")
+
+
+
     for figure in filtered_figures:
         # if figure exceed the boundary of the page, then only keep the part of figure that inside this page
         if not (
@@ -734,6 +834,7 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
             and fig0["y1"] >= fig1["y1"]
         ):
             if fig1 in filtered_figures:
+                LOG.debug("remove filtered figure due to contained in other figure")
                 filtered_figures.remove(fig1)
 
     # check if figures partially overlap
@@ -754,9 +855,14 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
                 # compare the size of two figures, keep the bigger figure
                 if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]:
                     if fig0 in filtered_figures:
+                        LOG.debug("remove filtered figure fig0 due to partially overlap")
                         filtered_figures.remove(fig0)
                 else:
                     if fig1 in filtered_figures:
+                        LOG.debug("remove filtered figure fig1 due to partially overlap")
                         filtered_figures.remove(fig1)
 
+    if len(filtered_figures) < len(figures_list):
+        LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures")
+
     return filtered_figures
diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
new file mode 100644
index 0000000..1c13cb7
--- /dev/null
+++ b/libpdf/models/rect.py
@@ -0,0 +1,57 @@
+"""Definition for PDF rects."""
+from typing import TYPE_CHECKING, List
+
+from libpdf.models.element import Element
+from libpdf.models.horizontal_box import HorizontalBox
+from libpdf.models.link import Link
+
+# avoid import cycles for back reference type hinting
+# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
+if TYPE_CHECKING:
+    # F401 imported but unused - it's needed for type hinting
+    from libpdf.models.position import Position  # noqa: F401, pylint: disable=ungrouped-imports
+
+
+class Rect(Element):
+    """
+    PDF rect with the text boxes and links assigned to it and its non-stroking color.
+    """
+
+    def __init__(
+        self,
+        idx: int,
+        position: 'Position',
+        links: List[Link],
+        textboxes: List[HorizontalBox],
+        non_stroking_color: str = None,  # NOTE(review): confirm str matches what extract_rects passes in
+    ):
+        """Initialize the instance and, if links are given, set their back reference to this rect."""
+        super().__init__(position=position)
+        self.idx = idx
+        self.textboxes = textboxes
+        self.links = links
+        self.non_stroking_color = non_stroking_color
+        if self.links:
+            self.set_links_backref()
+
+    @property
+    def id_(self):
+        """
+        Return the identifier to address the Rect.
+
+        The identifier follows the pattern ``rect.<idx>``.
+        idx is the 1-based number of the Rect in the current scope (root, chapter, sub-chapters, page).
+
+        It is used as a link target if a PDF link-annotation points to the Element.
+
+        According to PDF model the parameter should be called ``id`` but the name is reserved in Python, so ``id_``
+        is used.
+
+        :type: str
+        """
+        return f'rect.{self.idx}'
+
+    def set_links_backref(self):
+        """Set b_source back reference on all links."""
+        for link in self.links:
+            link.b_source = self
diff --git a/libpdf/parameters.py b/libpdf/parameters.py
index 2cbc2fa..70d0640 100644
--- a/libpdf/parameters.py
+++ b/libpdf/parameters.py
@@ -188,26 +188,29 @@
 # The following parameters are used for visual debugging
 # Give colors a human readable name
 COLORS = {
-    "red": (255, 0, 0),
-    "green": (0, 255, 0),
-    "blue": (0, 0, 255),
-    "yellow": (255, 255, 0),
+    'red': (255, 0, 0),
+    'green': (0, 255, 0),
+    'blue': (0, 0, 255),
+    'yellow': (255, 255, 0),
+    'cyan': (0, 255, 255)
 }
 
 # Map extracted elements with color
 # the numbers at the end means transparency, the value should be set in range (40, 160)
 VIS_DBG_MAP_ELEMENTS_COLOR = {
-    "chapter": COLORS["green"] + (80,),
-    "paragraph": COLORS["blue"] + (40,),
-    "table": COLORS["red"] + (40,),
-    "figure": COLORS["yellow"] + (80,),
+    'chapter': COLORS['green'] + (80,),
+    'paragraph': COLORS['blue'] + (40,),
+    'table': COLORS['red'] + (40,),
+    'figure': COLORS['yellow'] + (80,),
+    'rect': COLORS['cyan'] + (40,),
 }
 
 RENDER_ELEMENTS = [
-    "chapter",
-    "paragraph",
-    "table",
-    "figure",
+    'chapter',
+    'paragraph',
+    'table',
+    'figure',
+    'rect'
 ]  # the elements that shall be rendered
 
 # pdfminer layout analysis parameter from from pdfminer.layout -> LAParams.__init__
diff --git a/libpdf/process.py b/libpdf/process.py
index b0f1cc7..6c926e5 100644
--- a/libpdf/process.py
+++ b/libpdf/process.py
@@ -24,6 +24,7 @@
 from libpdf.models.chapter import Chapter
 from libpdf.models.element import Element
 from libpdf.models.figure import Figure
+from libpdf.models.rect import Rect
 from libpdf.models.link import Link
 from libpdf.models.model_base import ModelBase
 from libpdf.models.page import Page
@@ -66,7 +67,7 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict:  # pylint: disable=
     """Turn all objects attributes into a dictionary."""
     vars_dict = vars(obj).copy()
 
-    if isinstance(obj, (Chapter, Figure, Page, Paragraph, Table)):
+    if isinstance(obj, (Chapter, Figure, Page, Paragraph, Table, Rect)):
         # insert id as first key into vars_dict
         # After python3.6/3.7, a dict is sorted in insertion order
         #     https://docs.python.org/3.6/whatsnew/3.6.html#whatsnew36-compactdict
@@ -74,7 +75,7 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict:  # pylint: disable=
         temp_dict = {"id": obj.id_}
         temp_dict.update(vars_dict)
         vars_dict = temp_dict
-    if isinstance(obj, (Figure, Paragraph, Table)):
+    if isinstance(obj, (Figure, Paragraph, Table, Rect)):
         # idx is not part of the UML model and should not be exported
         del vars_dict["idx"]
     if isinstance(obj, Page):
@@ -84,9 +85,9 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict:  # pylint: disable=
         # textboxes with positions are not interest of the output file
         if obj.textbox:
             text = obj.textbox.text
-            vars_dict["text"] = text
-        del vars_dict["textbox"]
-    if isinstance(obj, Figure):
+            vars_dict['text'] = text
+        del vars_dict['textbox']
+    if isinstance(obj, (Figure, Rect)):
         # textboxes with positions are not interest of the output file
         if obj.textboxes:
             text = "\n".join(x.text for x in obj.textboxes)
@@ -219,13 +220,13 @@ def filter_out_outline_page(outline_dict):
 
 
 def map_elements_outline(
-    element_list: List[Union[Chapter, Figure, Table, Paragraph]],
+    element_list: List[Union[Chapter, Figure, Table, Paragraph, Rect]],
     outline_dict,
-) -> List[Union[Chapter, Figure, Table, Paragraph]]:
+) -> List[Union[Chapter, Figure, Table, Paragraph, Rect]]:
     """
     Map elements into a nested outline structure.
 
-    :param element_list: a list of elements including chapters, figures, tables, and paragraphs in a flatten structure.
+    :param element_list: a list of elements including chapters, figures, rects, tables, and paragraphs in a flatten structure.
     :param outline_dict: a nested outline structure from catalogs.
     :return:
     """
@@ -272,7 +273,7 @@ def map_elements_outline(
             del elements_in_outline[:idx]
             break
 
-    # acquire a list of chapters where their contents are filled with the corresponding elements, figures, tables
+    # acquire a list of chapters where their contents are filled with the corresponding elements, figures, rects, tables
     # and paragraphs. This chapter list is still in a flatten structure
     chapters_content_filled = fill_elements_content(elements_in_outline)
 
@@ -292,11 +293,9 @@ def map_elements_outline(
     return nested_elements
 
 
-def fill_elements_content(
-    elements_in_outline: List[Union[Chapter, Figure, Table, Paragraph]],
-) -> List[Chapter]:
+def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Rect, Table, Paragraph]]) -> List[Chapter]:
     """
-    Fill the elements, tables, figures and paragraphs into their corresponding chapters' contents.
+    Fill the elements, tables, figures, rects and paragraphs into their corresponding chapters' contents.
 
     The back chapter's reference of tables, figures, and paragraphs are added in this function
 
@@ -305,7 +304,7 @@ def fill_elements_content(
     """
     for index_element, element in enumerate(elements_in_outline):
         if isinstance(element, Chapter):
-            id_dict = {"table": 1, "figure": 1, "paragraph": 1}
+            id_dict = {'table': 1, 'figure': 1, 'paragraph': 1, 'rect': 1}
             content = elements_in_outline[index_element].content
             index_b_chapter = index_element
         else:
@@ -452,7 +451,7 @@ def libpdf_target_explorer(  # pylint: disable=too-many-nested-blocks # local al
 
 def elements_with_anno_finder(
     elements_on_page: List[Union[Paragraph, Table]],
-) -> Union[List[Union[Chapter, Paragraph, Figure, Table, Cell]], None]:
+) -> Union[List[Union[Chapter, Paragraph, Figure, Rect, Table, Cell]], None]:
     """
     Find the elements, tables or paragraphs containing source links.
 
@@ -536,7 +535,7 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) ->
     return target_id
 
 
-def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure]]:
+def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure, Rect]]:
     """
     Collect the elements, which occurs on a certain target page.
 
@@ -552,7 +551,7 @@ def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure]
     return elements_target_page
 
 
-def nest_explorer(element: Union[Figure, Table, Chapter, Paragraph]) -> str:
+def nest_explorer(element: Union[Figure, Rect, Table, Chapter, Paragraph]) -> str:
     """
     Explore the nested target ID path recursively.
 
diff --git a/libpdf/textbox.py b/libpdf/textbox.py
index e4db528..c352a42 100644
--- a/libpdf/textbox.py
+++ b/libpdf/textbox.py
@@ -46,6 +46,7 @@
 from libpdf.log import logging_needed
 from libpdf.models.chapter import Chapter
 from libpdf.models.figure import Figure
+from libpdf.models.rect import Rect
 from libpdf.models.link import Link
 from libpdf.models.page import Page
 from libpdf.models.paragraph import Paragraph
@@ -63,11 +64,11 @@
 
 LOG = logging.getLogger(__name__)
 
-
 def extract_paragraphs_chapters(
     pdf,
     figure_list: List[Figure],
     table_list: List[Table],
+    rect_list: List[Rect],
     page_list: List[Page],
     no_chapters,
     no_paragraphs,
@@ -93,7 +94,7 @@ def extract_paragraphs_chapters(
     return paragraph_list, chapter_list
 
 
-def extract_lt_textboxes(pdf, figure_list, table_list, page_list):
+def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list):
     """
     Extract and filter lt_textboxes using pdfminer.
 
@@ -103,6 +104,7 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list):
     :param pdf:
     :param figure_list:
     :param table_list:
+    :param rect_list:
     :param page_list:
     :return:
     """
@@ -862,10 +864,11 @@ def _flatten_outline(nested_outline, flatten_outline: List):
             _flatten_outline(chapter["content"], flatten_outline)
 
 
-def remove_lt_textboxes_in_tables_figures(
+def remove_lt_textboxes_in_tables_figures_rect(
     page_lt_textboxes: Dict[int, List[LTTextBox]],
     figure_list: List[Figure],
     table_list: List[Table],
+    rect_list: List[Rect]
 ):
     """
     Remove lt_textboxes in the coverage of tables or figures from page_lt_textboxes.
@@ -876,6 +879,7 @@ def remove_lt_textboxes_in_tables_figures(
     :param page_lt_textboxes:
     :param figure_list:
     :param table_list:
+    :param rect_list:
     :return:
     """
     page_lt_textboxes_filter = {}
@@ -904,9 +908,10 @@ def remove_lt_textboxes_in_tables_figures(
     return page_lt_textboxes_filter
 
 
-def tables_figures_merge(
+def tables_figures_rect_merge(
     figure_list: List[Figure],
     table_list: List[Table],
+    rect_list: List[Rect],
     page_index: int,
 ) -> List[Union[Figure, Table]]:
     """
@@ -917,6 +922,7 @@ def tables_figures_merge(
 
     :param figure_list: A list of all figures extracted from the pages in this pdf
     :param table_list: A list of all tables extracted from the pages in this pdf
+    :param rect_list:
     :param page_index: index of current page number
     :return:
     """
diff --git a/libpdf/utils.py b/libpdf/utils.py
index f39fa59..d9f1ba1 100644
--- a/libpdf/utils.py
+++ b/libpdf/utils.py
@@ -34,6 +34,7 @@
 from libpdf.models.chapter import Chapter
 from libpdf.models.element import Element
 from libpdf.models.figure import Figure
+from libpdf.models.rect import Rect
 from libpdf.models.horizontal_box import Char, HorizontalBox, HorizontalLine, Word
 from libpdf.models.paragraph import Paragraph
 from libpdf.models.table import Table
@@ -41,19 +42,20 @@
 from libpdf.progress import bar_format_lvl1, tqdm
 
 MAP_TYPES = {
-    Chapter: "chapter",
-    Paragraph: "paragraph",
-    Table: "table",
-    Figure: "figure",
-    LTChar: "paragraph",
-    LTCurve: "figure",
-    LTTextBox: "paragraph",
-    LTTextBoxHorizontal: "paragraph",
-    LTTextLineHorizontal: "paragraph",
-    LTFigure: "figure",
-    LTLine: "figure",
-    LTRect: "figure",
-    LTImage: "figure",
+    Chapter: 'chapter',
+    Paragraph: 'paragraph',
+    Table: 'table',
+    Figure: 'figure',
+    Rect: 'rect',
+    LTChar: 'paragraph',
+    LTCurve: 'figure',
+    LTTextBox: 'paragraph',
+    LTTextBoxHorizontal: 'paragraph',
+    LTTextLineHorizontal: 'paragraph',
+    LTFigure: 'figure',
+    LTLine: 'figure',
+    LTRect: 'rect',
+    LTImage: 'figure',
 }
 
 LOG = logging.getLogger(__name__)
@@ -639,10 +641,7 @@ def visual_debug_libpdf(  # pylint: disable=too-many-branches
     LOG.info("Starting visual debug...")
     # collect all elements
     all_elements = (
-        objects.flattened.chapters
-        + objects.flattened.paragraphs
-        + objects.flattened.tables
-        + objects.flattened.figures
+        objects.flattened.chapters + objects.flattened.paragraphs + objects.flattened.tables + objects.flattened.figures + objects.flattened.rects
     )
 
     # prepare for calling the common draw and output function
@@ -727,7 +726,7 @@ def render_pages(
                                     3: {...}
                                 }
 
-    :param render_elements: list of elements to render, options are chapter, paragraph, table, figure
+    :param render_elements: list of elements to render, options are chapter, paragraph, table, figure, rect
     :return: None
     """
     render_elements_joined = ", ".join(render_elements)

From 138110877393bb39d07b8bb970337bf8354732ff Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Sat, 20 Jan 2024 14:26:06 +0100
Subject: [PATCH 02/18] adapted code to include textboxes of rects

---
 libpdf/textbox.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/libpdf/textbox.py b/libpdf/textbox.py
index c352a42..ce4085f 100644
--- a/libpdf/textbox.py
+++ b/libpdf/textbox.py
@@ -94,7 +94,7 @@ def extract_paragraphs_chapters(
     return paragraph_list, chapter_list
 
 
-def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list):
+def extract_lt_textboxes(pdf, figure_list, table_list, page_list):
     """
     Extract and filter lt_textboxes using pdfminer.
 
@@ -104,7 +104,6 @@ def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list):
     :param pdf:
     :param figure_list:
     :param table_list:
-    :param rect_list:
     :param page_list:
     :return:
     """
@@ -116,7 +115,8 @@ def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list):
             del page_lt_textboxes[idx_page]
 
     if table_list is not None or figure_list is not None:
-        page_lt_textboxes_filtered = remove_lt_textboxes_in_tables_figures(
+        +9
+        = remove_lt_textboxes_in_tables_figures(
             page_lt_textboxes, figure_list, table_list
         )
     else:
@@ -864,11 +864,10 @@ def _flatten_outline(nested_outline, flatten_outline: List):
             _flatten_outline(chapter["content"], flatten_outline)
 
 
-def remove_lt_textboxes_in_tables_figures_rect(
+def remove_lt_textboxes_in_tables_figures(
     page_lt_textboxes: Dict[int, List[LTTextBox]],
     figure_list: List[Figure],
     table_list: List[Table],
-    rect_list: List[Rect]
 ):
     """
     Remove lt_textboxes in the coverage of tables or figures from page_lt_textboxes.
@@ -879,7 +878,6 @@ def remove_lt_textboxes_in_tables_figures_rect(
     :param page_lt_textboxes:
     :param figure_list:
     :param table_list:
-    :param rect_list:
     :return:
     """
     page_lt_textboxes_filter = {}
@@ -908,10 +906,9 @@ def remove_lt_textboxes_in_tables_figures_rect(
     return page_lt_textboxes_filter
 
 
-def tables_figures_rect_merge(
+def tables_figures_merge(
     figure_list: List[Figure],
     table_list: List[Table],
-    rect_list: List[Rect],
     page_index: int,
 ) -> List[Union[Figure, Table]]:
     """
@@ -922,7 +919,6 @@ def tables_figures_rect_merge(
 
     :param figure_list: A list of all figures extracted from the pages in this pdf
     :param table_list: A list of all tables extracted from the pages in this pdf
-    :param rect_list:
     :param page_index: index of current page number
     :return:
     """

From e57322a2efba9ded9576552f50714ffb78498a54 Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Sat, 20 Jan 2024 14:29:38 +0100
Subject: [PATCH 03/18] fixed typo

---
 libpdf/textbox.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libpdf/textbox.py b/libpdf/textbox.py
index ce4085f..f876485 100644
--- a/libpdf/textbox.py
+++ b/libpdf/textbox.py
@@ -115,8 +115,7 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list):
             del page_lt_textboxes[idx_page]
 
     if table_list is not None or figure_list is not None:
-        +9
-        = remove_lt_textboxes_in_tables_figures(
+        page_lt_textboxes_filtered = remove_lt_textboxes_in_tables_figures(
             page_lt_textboxes, figure_list, table_list
         )
     else:

From fc488f7854533ddbdbd8995c33201f45db1985bf Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Sat, 20 Jan 2024 14:46:00 +0100
Subject: [PATCH 04/18] make rect visually obvious

---
 libpdf/parameters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libpdf/parameters.py b/libpdf/parameters.py
index 70d0640..02d6cab 100644
--- a/libpdf/parameters.py
+++ b/libpdf/parameters.py
@@ -202,7 +202,7 @@
     'paragraph': COLORS['blue'] + (40,),
     'table': COLORS['red'] + (40,),
     'figure': COLORS['yellow'] + (80,),
-    'rect': COLORS['cyan'] + (40,),
+    'rect': COLORS['cyan'] + (160,),
 }
 
 RENDER_ELEMENTS = [

From 8b4447fe524b66bad351681873b568a11df16c7b Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Mon, 22 Jan 2024 10:18:31 +0100
Subject: [PATCH 05/18] improved comments

---
 libpdf/models/rect.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
index 1c13cb7..20090ac 100644
--- a/libpdf/models/rect.py
+++ b/libpdf/models/rect.py
@@ -1,4 +1,4 @@
-"""Definition for PDF figures."""
+"""Definition for PDF rects."""
 from typing import TYPE_CHECKING, List
 
 from libpdf.models.element import Element
@@ -23,7 +23,7 @@ def __init__(
         position: 'Position',
         links: List[Link],
         textboxes: List[HorizontalBox],
-        non_stroking_color: str = None,
+        non_stroking_color: tuple = None,
     ):
         """Initialize the instance."""
         super().__init__(position=position)

From f45b86d54815ccbb8e3cc441cf44f1bfd9433131 Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Mon, 22 Jan 2024 11:23:04 +0100
Subject: [PATCH 06/18] fixed docs

---
 docs/contents/api.rst        |  2 +-
 docs/contents/pdf_model.puml | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/docs/contents/api.rst b/docs/contents/api.rst
index b56afa6..c6b1df1 100644
--- a/docs/contents/api.rst
+++ b/docs/contents/api.rst
@@ -75,7 +75,7 @@ Figure
 Rect
 ~~~~~~
 
-.. autoclass:: libpdf.models.figure.Rect
+.. autoclass:: libpdf.models.rect.Rect
 
 
 Position
diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml
index 2233cfb..819b969 100644
--- a/docs/contents/pdf_model.puml
+++ b/docs/contents/pdf_model.puml
@@ -176,6 +176,13 @@
             **rel_path**: figures/<image>
         }
 
+        class Rect {
+            + [tuple] non_stroking_color
+            --
+            **id**: ""rect.<1,2,3,n>""
+            **non_stroking_color**: RGB color space
+        }
+
         package x <<layout>> {
 
            class HorizontalBox {
@@ -217,9 +224,7 @@
                 --
                 **text**: the character itself
             }
-        
-       
-        
+        }
         Char "+chars  1..*" ---left*  Word
         Word "+words  1..*" ---left*  HorizontalLine
         HorizontalLine "+lines  1..*" --*  HorizontalBox
@@ -228,10 +233,12 @@
         HorizontalBox "+textbox  0..1" --* Cell
         HorizontalBox "+textboxes  0..*" --* Figure
         HorizontalBox "+textbox  0..1" --* Chapter
+        HorizontalBox "+textboxes  0..*" --* Rect
 
         Paragraph "+b_source  1" *-- "+links  *" Link
         Figure "+b_source  1" *-- "+links  *" Link
         Cell "+b_source  1" *-- "+links  *" Link
+        Rect "+b_source  1" *-- "+links  *" Link
 
         Cell "+cells  1..*" --* "+b_table  1" Table
         Cell "+b_cell  1" *-down- "+position  1" Position
@@ -240,6 +247,7 @@
         Figure -[#COL_DERIVE]-|> Element
         Chapter -[#COL_DERIVE]-|> Element
         Paragraph -[#COL_DERIVE]|> Element
+        Rect -[#COL_DERIVE]-|> Element
         
         Chapter "1" *-- "+content  *" Element: ordered
         Page "1" *-- "+content  *" Element: ordered

From 2ab1fd7f4e012dfe77b46ded6e5293bc04ee569f Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Mon, 22 Jan 2024 12:00:01 +0100
Subject: [PATCH 07/18] adapted CLI

---
 libpdf/core.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/libpdf/core.py b/libpdf/core.py
index 034bba1..ab97b2b 100644
--- a/libpdf/core.py
+++ b/libpdf/core.py
@@ -472,6 +472,13 @@ def handle_parse_result(self, ctx, opts, args):
     help="Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if"
     " --save-figures is given.",
 )
+@click.option(
+    "--no-rects",
+    is_flag=True,
+    show_default=True,
+    help="Skip rects. rects will not be part of the output JSON/YAML structures and also not saved if"
+    " --save-rects is given.",
+)
 @click.option("-vd", "--visual-debug", is_flag=True, help="Visual debug libpdf.")
 @click.option(
     "-vo",

From 99d95b988d7bdd9de7e0308abca6ca18f5230873 Mon Sep 17 00:00:00 2001
From: Marco Heinemann <marco.heinemann@useblocks.com>
Date: Tue, 23 Jan 2024 09:08:52 +0100
Subject: [PATCH 08/18] Fix ruff issues for new rect file

---
 libpdf/models/rect.py | 42 +++++++++++++++++++++---------------------
 pyproject.toml        |  7 +++++++
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
index 20090ac..7d05e7d 100644
--- a/libpdf/models/rect.py
+++ b/libpdf/models/rect.py
@@ -1,29 +1,28 @@
 """Definition for PDF rects."""
-from typing import TYPE_CHECKING, List
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 from libpdf.models.element import Element
-from libpdf.models.horizontal_box import HorizontalBox
-from libpdf.models.link import Link
 
-# avoid import cycles for back reference type hinting
-# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
 if TYPE_CHECKING:
-    # F401 imported but unused - it's needed for type hinting
-    from libpdf.models.position import Position  # noqa: F401, pylint: disable=ungrouped-imports
+    from libpdf.models.horizontal_box import HorizontalBox
+    from libpdf.models.link import Link
+    from libpdf.models.position import (
+        Position,
+    )
 
 
 class Rect(Element):
-    """
-    PDF Rect.
-    """
+    """PDF Rect."""
 
     def __init__(
         self,
         idx: int,
-        position: 'Position',
-        links: List[Link],
-        textboxes: List[HorizontalBox],
-        non_stroking_color: tuple = None,
+        position: Position,
+        links: list[Link],
+        textboxes: list[HorizontalBox],
+        non_stroking_color: tuple | None = None,
     ):
         """Initialize the instance."""
         super().__init__(position=position)
@@ -32,26 +31,27 @@ def __init__(
         self.links = links
         self.non_stroking_color = non_stroking_color
         if self.links:
-            self.set_links_backref()
+            self._set_links_backref()
 
     @property
-    def id_(self):
+    def id_(self) -> str:
         """
         Return the identifier to address the Figure.
 
         The identifier follows the pattern ``figure.<idx>``.
-        idx the 1-based number of the Figure in the current scope (root, chapter, sub-chapters, page).
+        idx the 1-based number of the Figure in the current scope
+        (root, chapter, sub-chapters, page).
 
         It is used as a link target if a PDF link-annotation points to the Element.
 
-        According to PDF model the parameter should be called ``id`` but the name is reserved in Python, so ``id_``
-        is used.
+        According to PDF model the parameter should be called ``id`` but the name is
+        reserved in Python, so ``id_`` is used.
 
         :type: str
         """
-        return f'rect.{self.idx}'
+        return f"rect.{self.idx}"
 
-    def set_links_backref(self):
+    def _set_links_backref(self) -> None:
         """Set b_source back reference on all links."""
         for link in self.links:
             link.b_source = self
diff --git a/pyproject.toml b/pyproject.toml
index cfcf7ad..899746d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -139,6 +139,7 @@ ignore = [
     "FIX002", # line-contains-todo - TODOs are considered helpful
     "ISC001", # may conflict with Ruff formatter
     "TD003",  # missing-todo-link - no big processes for us
+    "ANN101", # missing-type-self - infer typing is simple for this library
 ]
 
 # Allow autofix for all enabled rules (when `--fix`) is provided.
@@ -148,6 +149,12 @@ unfixable = [
     "ERA001", # commented-out-code - commented code is commonly there for a reason
 ]
 
+[tool.ruff.lint.pylint]
+max-args = 8
+
+[tool.ruff.lint.flake8-annotations]
+mypy-init-return = true  # no init return type needed if at least one arg is annotated
+
 [tool.ruff.mccabe]
 max-complexity = 20
 

From b06bdcb34f614ef819a2ea9c4a58ae1da8d83c30 Mon Sep 17 00:00:00 2001
From: Marco Heinemann <marco.heinemann@useblocks.com>
Date: Tue, 23 Jan 2024 09:11:58 +0100
Subject: [PATCH 09/18] Improved docstring

---
 libpdf/models/rect.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
index 7d05e7d..72f7f8d 100644
--- a/libpdf/models/rect.py
+++ b/libpdf/models/rect.py
@@ -14,7 +14,13 @@
 
 
 class Rect(Element):
-    """PDF Rect."""
+    """
+    Rectangles in a PDF.
+
+    The rectangles are extracted from pdfplumber.
+    References to textboxes are stored in case the rectangle touches them.
+    The textboxes can still be part of paragraphs.
+    """
 
     def __init__(
         self,
@@ -24,7 +30,6 @@ def __init__(
         textboxes: list[HorizontalBox],
         non_stroking_color: tuple | None = None,
     ):
-        """Initialize the instance."""
         super().__init__(position=position)
         self.idx = idx
         self.textboxes = textboxes

From 8484832e1a383e667faa6b8ec80aa121961ede5d Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Mon, 22 Jan 2024 14:37:28 +0100
Subject: [PATCH 10/18] rect extraction not saved in figures

---
 libpdf/extract.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/libpdf/extract.py b/libpdf/extract.py
index 681d6b5..07c43e4 100644
--- a/libpdf/extract.py
+++ b/libpdf/extract.py
@@ -153,7 +153,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
             LOG.info('Excluding rects extraction')
             rect_list = []
         else:
-            rect_list = extract_rects(pdf, pages_list, figure_dir)
+            rect_list = extract_rects(pdf, pages_list)
             # smartly remove figures that are in header and footer
             if smart_page_crop:
                 rect_list = smart_page_crop_header_footer(pdf, rect_list)
@@ -658,7 +658,6 @@ def extract_figures(
 def extract_rects(
     pdf,
     pages_list,
-    figure_dir,
 ) -> List[
     Rect
 ]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
@@ -715,16 +714,7 @@ def extract_rects(
 
                     textboxes.append(hbox)
 
-                rect_name = f'page_{page.page_number}_rect.{idx_rect + 1}.png'
-
-                # create figures directory if not exist
-                Path(figure_dir).mkdir(parents=True, exist_ok=True)
-
-                rect_path = os.path.abspath(os.path.join(figure_dir, rect_name))
-
-                #figure = Figure(idx_figure + 1, image_path, fig_pos, links, textboxes, 'None')
-                #figure_list.append(figure)
-                rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color )
+                rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color)
                 rect_list.append(rect)
 
         else:

From 1d2a721634d3f67e771083457904806bd8654100 Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Tue, 23 Jan 2024 10:11:30 +0100
Subject: [PATCH 11/18] adapted model

---
 docs/contents/pdf_model.puml |  5 ++---
 libpdf/extract.py            | 34 +++++++++++++++-------------------
 libpdf/models/rect.py        | 20 ++++++--------------
 3 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml
index 819b969..219350a 100644
--- a/docs/contents/pdf_model.puml
+++ b/docs/contents/pdf_model.puml
@@ -233,12 +233,11 @@
         HorizontalBox "+textbox  0..1" --* Cell
         HorizontalBox "+textboxes  0..*" --* Figure
         HorizontalBox "+textbox  0..1" --* Chapter
-        HorizontalBox "+textboxes  0..*" --* Rect
+        HorizontalBox "+textbox  0..1" --* Rect
 
         Paragraph "+b_source  1" *-- "+links  *" Link
         Figure "+b_source  1" *-- "+links  *" Link
         Cell "+b_source  1" *-- "+links  *" Link
-        Rect "+b_source  1" *-- "+links  *" Link
 
         Cell "+cells  1..*" --* "+b_table  1" Table
         Cell "+b_cell  1" *-down- "+position  1" Position
@@ -247,7 +246,7 @@
         Figure -[#COL_DERIVE]-|> Element
         Chapter -[#COL_DERIVE]-|> Element
         Paragraph -[#COL_DERIVE]|> Element
-        Rect -[#COL_DERIVE]-|> Element
+        Rect -[#COL_DERIVE]|> Element
         
         Chapter "1" *-- "+content  *" Element: ordered
         Page "1" *-- "+content  *" Element: ordered
diff --git a/libpdf/extract.py b/libpdf/extract.py
index 07c43e4..c1dd8bb 100644
--- a/libpdf/extract.py
+++ b/libpdf/extract.py
@@ -34,7 +34,7 @@
 from libpdf.progress import bar_format_lvl2, tqdm
 from libpdf.tables import extract_pdf_table
 from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters
-from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox
+from libpdf.utils import lt_page_crop,lt_textbox_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox
 
 LOG = logging.getLogger(__name__)
 
@@ -690,31 +690,27 @@ def extract_rects(
                 )
 
                 non_stroking_color = rect['non_stroking_color']
-                fill = rect['fill']
 
-                bbox = (rect_pos.x0, rect_pos.y0, rect_pos.x1, rect_pos.y1)
-
-                LOG.info(f"found rect at {bbox} at page {idx_page+1}: color {non_stroking_color}");
-
-                lt_textboxes = lt_page_crop(
-                    bbox,
-                    lt_page._objs,  # pylint: disable=protected-access # access needed
-                    LTText,
-                    contain_completely=True,
+                offset = 5
+                rect_bbox = (
+                    rect_pos.x0 - offset,
+                    rect_pos.y0 - offset,
+                    rect_pos.x1 + offset,
+                    rect_pos.y1 + offset,
                 )
 
-                textboxes = []
-                links = []
-                for lt_textbox in lt_textboxes:
-                    if catalog['annos']:
-                        links.extend(extract_linked_chars(lt_textbox, lt_page.pageid))
-                    bbox = (lt_textbox.x0, lt_textbox.y0, lt_textbox.x1, lt_textbox.y1)
+                LOG.info(f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}");
 
+                lt_textbox = lt_textbox_crop(
+                    rect_bbox, lt_page._objs,
+                    word_margin=LA_PARAMS["word_margin"],
+                    y_tolerance=LA_PARAMS["line_overlap"],
+                    )
+                if lt_textbox:
                     hbox = lt_to_libpdf_hbox_converter(lt_textbox)
 
-                    textboxes.append(hbox)
 
-                rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color)
+                rect = Rect( idx_rect + 1, rect_pos, hbox, non_stroking_color)
                 rect_list.append(rect)
 
         else:
diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
index 72f7f8d..700841b 100644
--- a/libpdf/models/rect.py
+++ b/libpdf/models/rect.py
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING
 
 from libpdf.models.element import Element
+from libpdf.models.horizontal_box import HorizontalBox
 
 if TYPE_CHECKING:
     from libpdf.models.horizontal_box import HorizontalBox
@@ -25,18 +26,14 @@ class Rect(Element):
     def __init__(
         self,
         idx: int,
-        position: Position,
-        links: list[Link],
-        textboxes: list[HorizontalBox],
-        non_stroking_color: tuple | None = None,
+        position: 'Position',
+        textbox: HorizontalBox,
+        non_stroking_color: tuple = None,
     ):
         super().__init__(position=position)
         self.idx = idx
-        self.textboxes = textboxes
-        self.links = links
+        self.textbox = textbox
         self.non_stroking_color = non_stroking_color
-        if self.links:
-            self._set_links_backref()
 
     @property
     def id_(self) -> str:
@@ -54,9 +51,4 @@ def id_(self) -> str:
 
         :type: str
         """
-        return f"rect.{self.idx}"
-
-    def _set_links_backref(self) -> None:
-        """Set b_source back reference on all links."""
-        for link in self.links:
-            link.b_source = self
+        return f'rect.{self.idx}'

From 5cd2e647922afdee66e56b0cb30d369bc20e1319 Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Tue, 23 Jan 2024 11:45:22 +0100
Subject: [PATCH 12/18] changed rect

-remove crop_rects_text flag
-text within the rect is extracted
---
 libpdf/core.py        | 7 -------
 libpdf/extract.py     | 6 ++----
 libpdf/models/rect.py | 5 ++---
 libpdf/process.py     | 4 ++--
 libpdf/textbox.py     | 1 -
 5 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/libpdf/core.py b/libpdf/core.py
index ab97b2b..cd402e7 100644
--- a/libpdf/core.py
+++ b/libpdf/core.py
@@ -36,7 +36,6 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     no_tables: bool = False,
     no_figures: bool = False,
     no_rects: bool = False,
-    crop_rects_text: bool = False,
     cli_usage: bool = False,
     visual_debug: bool = False,
     visual_debug_output_dir: str = None,
@@ -69,7 +68,6 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
     :param no_rects: flag triggering the exclusion of rects
-    :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs
     :param cli_usage: flag indicating that the function was called through CLI
     :param visual_debug: flag triggering visual debug feature
     :param visual_debug_output_dir: output directory for visualized pdf pages
@@ -120,7 +118,6 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
         LOG.info('Extract tables: %s', 'no' if no_tables else 'yes')
         LOG.info('Extract figures: %s', 'no' if no_figures else 'yes')
         LOG.info('Extract rects: %s', 'no' if no_rects else 'yes')
-        LOG.info('Text rects crop: %s', 'no' if crop_rects_text else 'no')
         overall_pbar.update(1)
         try:
             objects = extract(
@@ -135,7 +132,6 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
                 no_tables,
                 no_figures,
                 no_rects,
-                crop_rects_text,
                 overall_pbar,
             )
         except LibpdfException:
@@ -181,7 +177,6 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     no_tables: bool = False,
     no_figures: bool = False,
     no_rects: bool = False,
-    crop_rects_text: bool = False,
     init_logging: bool = True,
     visual_debug: bool = False,
     visual_debug_output_dir: str = "visual_debug_libpdf",
@@ -208,7 +203,6 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
     :param no_rects: flag triggering the exclusion of rects
-    :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs
     :param init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of
                          handling both log messages and progress bars; it does so by passing all log messages to
                          tqdm.write()
@@ -253,7 +247,6 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
         no_tables=no_tables,
         no_figures=no_figures,
         no_rects = no_rects,
-        crop_rects_text = crop_rects_text,
         cli_usage=False,
         visual_debug=visual_debug,
         visual_debug_output_dir=visual_debug_output_dir,
diff --git a/libpdf/extract.py b/libpdf/extract.py
index c1dd8bb..a9b4a11 100644
--- a/libpdf/extract.py
+++ b/libpdf/extract.py
@@ -63,7 +63,6 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     no_tables: bool,
     no_figures: bool,
     no_rects: bool,
-    crop_rects_text: bool,
     overall_pbar: tqdm,
 ) -> ApiObjects:
     """
@@ -80,7 +79,6 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
     :param no_tables: flag triggering the exclusion of tables
     :param no_figures: flag triggering the exclusion of figures
     :param no_rects: flag triggering the exclusion of rects
-    :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs
     :param overall_pbar: total progress bar for whole libpdf run
     :return: instance of Objects class
     :raise LibpdfException: PDF contains no pages
@@ -175,7 +173,6 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
             pdf,
             figure_list,
             table_list,
-            rect_list if crop_rects_text else [],
             pages_list,
             no_chapters,
             no_paragraphs,
@@ -708,7 +705,8 @@ def extract_rects(
                     )
                 if lt_textbox:
                     hbox = lt_to_libpdf_hbox_converter(lt_textbox)
-
+                else:
+                    hbox = None
 
                 rect = Rect( idx_rect + 1, rect_pos, hbox, non_stroking_color)
                 rect_list.append(rect)
diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
index 700841b..7796f4d 100644
--- a/libpdf/models/rect.py
+++ b/libpdf/models/rect.py
@@ -8,7 +8,6 @@
 
 if TYPE_CHECKING:
     from libpdf.models.horizontal_box import HorizontalBox
-    from libpdf.models.link import Link
     from libpdf.models.position import (
         Position,
     )
@@ -19,8 +18,8 @@ class Rect(Element):
     Rectangles in a PDF.
 
     The rectangles are extracted from pdfplumber.
-    References to textboxes are stored in case the rectangle touches them.
-    The textboxes can still be part of paragraphs.
+    The text covered in the rectangle is extracted and 
+    stored in an newly instantiated textbox.
     """
 
     def __init__(
diff --git a/libpdf/process.py b/libpdf/process.py
index 6c926e5..7cb9072 100644
--- a/libpdf/process.py
+++ b/libpdf/process.py
@@ -81,13 +81,13 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict:  # pylint: disable=
     if isinstance(obj, Page):
         # no serialization for the contents of pages
         del vars_dict["content"]
-    if isinstance(obj, (Paragraph, Cell, Chapter)):
+    if isinstance(obj, (Paragraph, Cell, Chapter, Rect)):
         # textboxes with positions are not interest of the output file
         if obj.textbox:
             text = obj.textbox.text
             vars_dict['text'] = text
         del vars_dict['textbox']
-    if isinstance(obj, (Figure, Rect)):
+    if isinstance(obj, (Figure)):
         # textboxes with positions are not interest of the output file
         if obj.textboxes:
             text = "\n".join(x.text for x in obj.textboxes)
diff --git a/libpdf/textbox.py b/libpdf/textbox.py
index f876485..200c448 100644
--- a/libpdf/textbox.py
+++ b/libpdf/textbox.py
@@ -68,7 +68,6 @@ def extract_paragraphs_chapters(
     pdf,
     figure_list: List[Figure],
     table_list: List[Table],
-    rect_list: List[Rect],
     page_list: List[Page],
     no_chapters,
     no_paragraphs,

From b6b966100ea1a6c54e2d5fa9ac12f0dff4bf709b Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Tue, 23 Jan 2024 11:48:46 +0100
Subject: [PATCH 13/18] reformatted by ruff

---
 libpdf/apiobjects.py  | 10 ++++--
 libpdf/core.py        | 27 ++++++++------
 libpdf/extract.py     | 82 ++++++++++++++++++++++++++-----------------
 libpdf/models/rect.py |  6 ++--
 libpdf/parameters.py  | 30 ++++++++--------
 libpdf/process.py     | 12 ++++---
 libpdf/textbox.py     |  2 +-
 libpdf/utils.py       | 36 ++++++++++---------
 8 files changed, 119 insertions(+), 86 deletions(-)

diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py
index 042d36b..9ca56cf 100644
--- a/libpdf/apiobjects.py
+++ b/libpdf/apiobjects.py
@@ -7,8 +7,8 @@
 
 from libpdf.models.chapter import Chapter
 from libpdf.models.figure import Figure
-from libpdf.models.rect import Rect
 from libpdf.models.paragraph import Paragraph
+from libpdf.models.rect import Rect
 from libpdf.models.root import Root
 from libpdf.models.table import Table
 
@@ -46,7 +46,13 @@ def __init__(  # pylint: disable=too-many-arguments  # the parameters are needed
         self.root = root
 
         # attributes for API convenience
-        self.flattened = Flattened(chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures, rects=rects)
+        self.flattened = Flattened(
+            chapters=chapters,
+            paragraphs=paragraphs,
+            tables=tables,
+            figures=figures,
+            rects=rects,
+        )
 
         # exposing the pdfplumber PDF object
         self.pdfplumber = pdfplumber
diff --git a/libpdf/core.py b/libpdf/core.py
index cd402e7..fc7bac9 100644
--- a/libpdf/core.py
+++ b/libpdf/core.py
@@ -108,16 +108,21 @@ def main(  # pylint: disable=too-many-arguments,too-many-locals  # no reasonable
         if output_path:
             LOG.info("Output path: %s", output_path)
         else:
-            LOG.info('Writing extracted data to stdout')
-        LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages))
-        LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop))
-        LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off')
-        LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes')
-        LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes')
-        LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes')
-        LOG.info('Extract tables: %s', 'no' if no_tables else 'yes')
-        LOG.info('Extract figures: %s', 'no' if no_figures else 'yes')
-        LOG.info('Extract rects: %s', 'no' if no_rects else 'yes')
+            LOG.info("Writing extracted data to stdout")
+        LOG.info(
+            "Page range: [%s]", "all" if not pages else ",".join(str(x) for x in pages)
+        )
+        LOG.info(
+            "Page crop: %s",
+            "not cropped" if not page_crop else " ".join(str(x) for x in page_crop),
+        )
+        LOG.info("Smart page crop: %s", "on" if smart_page_crop else "off")
+        LOG.info("Extract annotations: %s", "no" if no_annotations else "yes")
+        LOG.info("Extract chapters: %s", "no" if no_chapters else "yes")
+        LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes")
+        LOG.info("Extract tables: %s", "no" if no_tables else "yes")
+        LOG.info("Extract figures: %s", "no" if no_figures else "yes")
+        LOG.info("Extract rects: %s", "no" if no_rects else "yes")
         overall_pbar.update(1)
         try:
             objects = extract(
@@ -246,7 +251,7 @@ def main_api(  # pylint: disable=too-many-arguments, too-many-locals
         no_paragraphs=no_paragraphs,
         no_tables=no_tables,
         no_figures=no_figures,
-        no_rects = no_rects,
+        no_rects=no_rects,
         cli_usage=False,
         visual_debug=visual_debug,
         visual_debug_output_dir=visual_debug_output_dir,
diff --git a/libpdf/extract.py b/libpdf/extract.py
index a9b4a11..311710d 100644
--- a/libpdf/extract.py
+++ b/libpdf/extract.py
@@ -17,11 +17,11 @@
 from libpdf.exceptions import LibpdfException
 from libpdf.log import logging_needed
 from libpdf.models.figure import Figure
-from libpdf.models.rect import Rect
 from libpdf.models.file import File
 from libpdf.models.file_meta import FileMeta
 from libpdf.models.page import Page
 from libpdf.models.position import Position
+from libpdf.models.rect import Rect
 from libpdf.models.root import Root
 from libpdf.parameters import (
     FIGURE_MIN_HEIGHT,
@@ -34,7 +34,12 @@
 from libpdf.progress import bar_format_lvl2, tqdm
 from libpdf.tables import extract_pdf_table
 from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters
-from libpdf.utils import lt_page_crop,lt_textbox_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox
+from libpdf.utils import (
+    lt_page_crop,
+    lt_textbox_crop,
+    lt_to_libpdf_hbox_converter,
+    to_pdfplumber_bbox,
+)
 
 LOG = logging.getLogger(__name__)
 
@@ -148,7 +153,7 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
         overall_pbar.update(15)
 
         if no_rects:
-            LOG.info('Excluding rects extraction')
+            LOG.info("Excluding rects extraction")
             rect_list = []
         else:
             rect_list = extract_rects(pdf, pages_list)
@@ -157,7 +162,6 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
                 rect_list = smart_page_crop_header_footer(pdf, rect_list)
         overall_pbar.update(15)
 
-
         if no_tables:
             LOG.info("Excluding tables extraction")
             table_list = []
@@ -182,7 +186,9 @@ def extract(  # pylint: disable=too-many-locals, too-many-branches, too-many-sta
         if smart_page_crop:
             paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list)
 
-        element_list = pro.merge_all_elements(figure_list, table_list, paragraph_list, chapter_list, rect_list)
+        element_list = pro.merge_all_elements(
+            figure_list, table_list, paragraph_list, chapter_list, rect_list
+        )
 
         # to check if elements shall be mapped into nested outline structure.
         if catalog["outline"] is not None and not no_chapters:
@@ -652,41 +658,44 @@ def extract_figures(
 
     return figure_list
 
+
 def extract_rects(
     pdf,
     pages_list,
-) -> List[
-    Rect
-]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
+) -> List[Rect]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
     """Extract rects in PDF."""
-    LOG.info('Extracting rects ...')
+    LOG.info("Extracting rects ...")
     rect_list = []
 
     for idx_page, page in enumerate(  # pylint: disable=too-many-nested-blocks
-        tqdm(pdf.pages, desc='###### Extracting rects', unit='pages', bar_format=bar_format_lvl2()),
+        tqdm(
+            pdf.pages,
+            desc="###### Extracting rects",
+            unit="pages",
+            bar_format=bar_format_lvl2(),
+        ),
     ):
         if logging_needed(idx_page, len(pdf.pages)):
-            LOG.debug('Extracting rects page %s of %s', idx_page + 1, len(pdf.pages))
+            LOG.debug("Extracting rects page %s of %s", idx_page + 1, len(pdf.pages))
         page_crop = pro.remove_page_header_footer(page)
         lt_page = page._layout  # pylint: disable=protected-access  # easiest way to obtain LTPage
 
         # check and filter figures
-        #figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else []
-        #rects = page_crop.objects['rects'] if 'rects' in page_crop.objects else []
-        rects = page.objects['rect'] if 'rect' in page.objects else []
-
+        # figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else []
+        # rects = page_crop.objects['rects'] if 'rects' in page_crop.objects else []
+        rects = page.objects["rect"] if "rect" in page.objects else []
 
         if len(rects) != 0:
             for idx_rect, rect in enumerate(rects):
                 rect_pos = Position(
-                    float(rect['x0']),
-                    float(rect['y0']),
-                    float(rect['x1']),
-                    float(rect['y1']),
+                    float(rect["x0"]),
+                    float(rect["y0"]),
+                    float(rect["x1"]),
+                    float(rect["y1"]),
                     pages_list[idx_page],
                 )
 
-                non_stroking_color = rect['non_stroking_color']
+                non_stroking_color = rect["non_stroking_color"]
 
                 offset = 5
                 rect_bbox = (
@@ -696,26 +705,27 @@ def extract_rects(
                     rect_pos.y1 + offset,
                 )
 
-                LOG.info(f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}");
-
+                LOG.info(
+                    f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}"
+                )
                 lt_textbox = lt_textbox_crop(
-                    rect_bbox, lt_page._objs,
+                    rect_bbox,
+                    lt_page._objs,
                     word_margin=LA_PARAMS["word_margin"],
                     y_tolerance=LA_PARAMS["line_overlap"],
-                    )
+                )
                 if lt_textbox:
                     hbox = lt_to_libpdf_hbox_converter(lt_textbox)
                 else:
                     hbox = None
 
-                rect = Rect( idx_rect + 1, rect_pos, hbox, non_stroking_color)
+                rect = Rect(idx_rect + 1, rect_pos, hbox, non_stroking_color)
                 rect_list.append(rect)
 
         else:
             LOG.info(f"found no rects on page {idx_page+1}: {page_crop.objects.keys()}")
 
-
-    #return figure_list
+    # return figure_list
     return rect_list
 
 
@@ -788,9 +798,9 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
             filtered_figures.append(figure)
 
     if len(filtered_figures) < len(figures_list):
-        LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)}  due to invalid height/width")
-
-
+        LOG.debug(
+            f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)}  due to invalid height/width"
+        )
 
     for figure in filtered_figures:
         # if figure exceed the boundary of the page, then only keep the part of figure that inside this page
@@ -839,14 +849,20 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
                 # compare the size of two figures, keep the bigger figure
                 if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]:
                     if fig0 in filtered_figures:
-                        LOG.debug("remove filtered figure fig0 due to partially overlap")
+                        LOG.debug(
+                            "remove filtered figure fig0 due to partially overlap"
+                        )
                         filtered_figures.remove(fig0)
                 else:
                     if fig1 in filtered_figures:
-                        LOG.debug("remove filtered figure fig1 due to partially overlap")
+                        LOG.debug(
+                            "remove filtered figure fig1 due to partially overlap"
+                        )
                         filtered_figures.remove(fig1)
 
     if len(filtered_figures) < len(figures_list):
-        LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures")
+        LOG.debug(
+            f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures"
+        )
 
     return filtered_figures
diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py
index 7796f4d..7e898d0 100644
--- a/libpdf/models/rect.py
+++ b/libpdf/models/rect.py
@@ -18,14 +18,14 @@ class Rect(Element):
     Rectangles in a PDF.
 
     The rectangles are extracted from pdfplumber.
-    The text covered in the rectangle is extracted and 
+    The text covered in the rectangle is extracted and
     stored in an newly instantiated textbox.
     """
 
     def __init__(
         self,
         idx: int,
-        position: 'Position',
+        position: Position,
         textbox: HorizontalBox,
         non_stroking_color: tuple = None,
     ):
@@ -50,4 +50,4 @@ def id_(self) -> str:
 
         :type: str
         """
-        return f'rect.{self.idx}'
+        return f"rect.{self.idx}"
diff --git a/libpdf/parameters.py b/libpdf/parameters.py
index 02d6cab..afd9e0d 100644
--- a/libpdf/parameters.py
+++ b/libpdf/parameters.py
@@ -188,29 +188,29 @@
 # The following parameters are used for visual debugging
 # Give colors a human readable name
 COLORS = {
-    'red': (255, 0, 0),
-    'green': (0, 255, 0),
-    'blue': (0, 0, 255),
-    'yellow': (255, 255, 0),
-    'cyan': (255,255,1)
+    "red": (255, 0, 0),
+    "green": (0, 255, 0),
+    "blue": (0, 0, 255),
+    "yellow": (255, 255, 0),
+    "cyan": (255, 255, 1),
 }
 
 # Map extracted elements with color
 # the numbers at the end means transparency, the value should be set in range (40, 160)
 VIS_DBG_MAP_ELEMENTS_COLOR = {
-    'chapter': COLORS['green'] + (80,),
-    'paragraph': COLORS['blue'] + (40,),
-    'table': COLORS['red'] + (40,),
-    'figure': COLORS['yellow'] + (80,),
-    'rect': COLORS['cyan'] + (160,),
+    "chapter": COLORS["green"] + (80,),
+    "paragraph": COLORS["blue"] + (40,),
+    "table": COLORS["red"] + (40,),
+    "figure": COLORS["yellow"] + (80,),
+    "rect": COLORS["cyan"] + (160,),
 }
 
 RENDER_ELEMENTS = [
-    'chapter',
-    'paragraph',
-    'table',
-    'figure',
-    'rect'
+    "chapter",
+    "paragraph",
+    "table",
+    "figure",
+    "rect",
 ]  # the elements that shall be rendered
 
 # pdfminer layout analysis parameter from from pdfminer.layout -> LAParams.__init__
diff --git a/libpdf/process.py b/libpdf/process.py
index 7cb9072..6500aec 100644
--- a/libpdf/process.py
+++ b/libpdf/process.py
@@ -24,12 +24,12 @@
 from libpdf.models.chapter import Chapter
 from libpdf.models.element import Element
 from libpdf.models.figure import Figure
-from libpdf.models.rect import Rect
 from libpdf.models.link import Link
 from libpdf.models.model_base import ModelBase
 from libpdf.models.page import Page
 from libpdf.models.paragraph import Paragraph
 from libpdf.models.position import Position
+from libpdf.models.rect import Rect
 from libpdf.models.table import Cell, Table
 from libpdf.parameters import HEADLINE_TOLERANCE
 
@@ -85,8 +85,8 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict:  # pylint: disable=
         # textboxes with positions are not interest of the output file
         if obj.textbox:
             text = obj.textbox.text
-            vars_dict['text'] = text
-        del vars_dict['textbox']
+            vars_dict["text"] = text
+        del vars_dict["textbox"]
     if isinstance(obj, (Figure)):
         # textboxes with positions are not interest of the output file
         if obj.textboxes:
@@ -293,7 +293,9 @@ def map_elements_outline(
     return nested_elements
 
 
-def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Rect, Table, Paragraph]]) -> List[Chapter]:
+def fill_elements_content(
+    elements_in_outline: List[Union[Chapter, Figure, Rect, Table, Paragraph]],
+) -> List[Chapter]:
     """
     Fill the elements, tables, figures, rects and paragraphs into their corresponding chapters' contents.
 
@@ -304,7 +306,7 @@ def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Rect,
     """
     for index_element, element in enumerate(elements_in_outline):
         if isinstance(element, Chapter):
-            id_dict = {'table': 1, 'figure': 1, 'paragraph': 1, 'rect': 1}
+            id_dict = {"table": 1, "figure": 1, "paragraph": 1, "rect": 1}
             content = elements_in_outline[index_element].content
             index_b_chapter = index_element
         else:
diff --git a/libpdf/textbox.py b/libpdf/textbox.py
index 200c448..e4db528 100644
--- a/libpdf/textbox.py
+++ b/libpdf/textbox.py
@@ -46,7 +46,6 @@
 from libpdf.log import logging_needed
 from libpdf.models.chapter import Chapter
 from libpdf.models.figure import Figure
-from libpdf.models.rect import Rect
 from libpdf.models.link import Link
 from libpdf.models.page import Page
 from libpdf.models.paragraph import Paragraph
@@ -64,6 +63,7 @@
 
 LOG = logging.getLogger(__name__)
 
+
 def extract_paragraphs_chapters(
     pdf,
     figure_list: List[Figure],
diff --git a/libpdf/utils.py b/libpdf/utils.py
index d9f1ba1..b593191 100644
--- a/libpdf/utils.py
+++ b/libpdf/utils.py
@@ -34,28 +34,28 @@
 from libpdf.models.chapter import Chapter
 from libpdf.models.element import Element
 from libpdf.models.figure import Figure
-from libpdf.models.rect import Rect
 from libpdf.models.horizontal_box import Char, HorizontalBox, HorizontalLine, Word
 from libpdf.models.paragraph import Paragraph
+from libpdf.models.rect import Rect
 from libpdf.models.table import Table
 from libpdf.parameters import RENDER_ELEMENTS, VIS_DBG_MAP_ELEMENTS_COLOR
 from libpdf.progress import bar_format_lvl1, tqdm
 
 MAP_TYPES = {
-    Chapter: 'chapter',
-    Paragraph: 'paragraph',
-    Table: 'table',
-    Figure: 'figure',
-    Rect: 'rect',
-    LTChar: 'paragraph',
-    LTCurve: 'figure',
-    LTTextBox: 'paragraph',
-    LTTextBoxHorizontal: 'paragraph',
-    LTTextLineHorizontal: 'paragraph',
-    LTFigure: 'figure',
-    LTLine: 'figure',
-    LTRect: 'rect',
-    LTImage: 'figure',
+    Chapter: "chapter",
+    Paragraph: "paragraph",
+    Table: "table",
+    Figure: "figure",
+    Rect: "rect",
+    LTChar: "paragraph",
+    LTCurve: "figure",
+    LTTextBox: "paragraph",
+    LTTextBoxHorizontal: "paragraph",
+    LTTextLineHorizontal: "paragraph",
+    LTFigure: "figure",
+    LTLine: "figure",
+    LTRect: "rect",
+    LTImage: "figure",
 }
 
 LOG = logging.getLogger(__name__)
@@ -641,7 +641,11 @@ def visual_debug_libpdf(  # pylint: disable=too-many-branches
     LOG.info("Starting visual debug...")
     # collect all elements
     all_elements = (
-        objects.flattened.chapters + objects.flattened.paragraphs + objects.flattened.tables + objects.flattened.figures + objects.flattened.rects
+        objects.flattened.chapters
+        + objects.flattened.paragraphs
+        + objects.flattened.tables
+        + objects.flattened.figures
+        + objects.flattened.rects
     )
 
     # prepare for calling the common draw and output function

From 1aa72240b8830e31231905bbb23dc6304f8b143f Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Tue, 23 Jan 2024 11:49:06 +0100
Subject: [PATCH 14/18] Bump version from 0.0.1 to 0.1.1

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 899746d..bb8f351 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "libpdf"
-version = "0.0.1"
+version = "0.1.1"
 description = "Extract structured data from PDFs."
 authors = [
     "Marco Heinemann <marco.heinemann@useblocks.com>",

From 64fc19fef2255425926a97164349bc55a977c7c7 Mon Sep 17 00:00:00 2001
From: "jui-wen.chen" <jui-wen.chen@useblocks.com>
Date: Tue, 23 Jan 2024 14:23:55 +0100
Subject: [PATCH 15/18] Use resolve1() instead of .resolve() for the Dests name tree

---
 libpdf/catalog.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libpdf/catalog.py b/libpdf/catalog.py
index e78de7f..fedc50b 100644
--- a/libpdf/catalog.py
+++ b/libpdf/catalog.py
@@ -3,7 +3,7 @@
 import re
 from typing import Any, Dict, List, Union
 
-from pdfminer.pdftypes import PDFObjRef
+from pdfminer.pdftypes import PDFObjRef, resolve1
 from pdfminer.psparser import PSLiteral
 
 from libpdf.log import logging_needed
@@ -51,7 +51,7 @@ def get_named_destination(pdf):  # pylint: disable=too-many-branches
         ):
             name_tree = pdf_catalog["Names"].resolve()["Dests"].resolve()
         elif isinstance(pdf_catalog["Names"], dict) and "Dests" in pdf_catalog["Names"]:
-            name_tree = pdf_catalog["Names"]["Dests"].resolve()
+            name_tree = resolve1(pdf_catalog["Names"]["Dests"])
         # check if name tree not empty
         if name_tree:
             # map page id to page number

From 76813902208d75ff65b093769b55e79321ff9b0d Mon Sep 17 00:00:00 2001
From: Marco Heinemann <marco.heinemann@useblocks.com>
Date: Tue, 23 Jan 2024 14:42:50 +0100
Subject: [PATCH 16/18] Improve docs for non_stroking_color

---
 docs/contents/pdf_model.puml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml
index 219350a..67de5ae 100644
--- a/docs/contents/pdf_model.puml
+++ b/docs/contents/pdf_model.puml
@@ -180,7 +180,8 @@
             + [tuple] non_stroking_color
             --
             **id**: ""rect.<1,2,3,n>""
-            **non_stroking_color**: RGB color space
+            **non_stroking_color**: RGB color space,
+            given as floats between 0 and 1
         }
 
         package x <<layout>> {

From 7335d53624f7704baf5d3538a0d871b624d0e301 Mon Sep 17 00:00:00 2001
From: Marco Heinemann <marco.heinemann@useblocks.com>
Date: Tue, 23 Jan 2024 14:45:05 +0100
Subject: [PATCH 17/18] Revert version update

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bb8f351..899746d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "libpdf"
-version = "0.1.1"
+version = "0.0.1"
 description = "Extract structured data from PDFs."
 authors = [
     "Marco Heinemann <marco.heinemann@useblocks.com>",

From 946c8200e3eb0d14c40c785bf45f7f4c01dcff89 Mon Sep 17 00:00:00 2001
From: Marco Heinemann <marco.heinemann@useblocks.com>
Date: Tue, 23 Jan 2024 14:51:30 +0100
Subject: [PATCH 18/18] Updated changelog

---
 docs/contents/changelog.rst | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/contents/changelog.rst b/docs/contents/changelog.rst
index c341097..0d19565 100644
--- a/docs/contents/changelog.rst
+++ b/docs/contents/changelog.rst
@@ -20,11 +20,15 @@ __ https://github.com/useblocks/libpdf/compare/v0.0.1...HEAD
 Added
 ~~~~~
 
+- Introduced new element ``Rect`` (`PR #30 <https://github.com/useblocks/libpdf/pull/30>`_)
+- Introduced Ruff as linter and formatter (PRs `#28 <https://github.com/useblocks/libpdf/pull/28>`_,
+  `#29 <https://github.com/useblocks/libpdf/pull/29>`_ and `#31 <https://github.com/useblocks/libpdf/pull/31>`_)
+- Added support for Python 3.10, 3.11 and 3.12 (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)
+
 Changed
 ~~~~~~~
 
 - Updated downstream library dependencies
-
 - Added flag ``--no_annotations`` to exclude annotation extraction from the catalog to speed up extraction
   (`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)
 
@@ -33,6 +37,8 @@ Removed
 
 - libpdf has dropped support for Python 3.6, which reached end-of-life on 2021-12-23; this also fixes the Pillow 8
   security vulnerabilities
+- libpdf has dropped support for Python 3.7, which reached end-of-life on 2023-06-27
+  (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)
 
 Fixed
 ~~~~~