Adding tests for rect extractions (#36)

* adding tests for rect * finish basic tests * disable visual debug for tests, seems to depend on external tools * adding ghostscript hint in visual debug
useblocks · Feb 21, 2024 · 29fe5a3 · 29fe5a3
1 parent c285d7d
commit 29fe5a3
Show file tree

Hide file tree

Showing 18 changed files with 388 additions and 190 deletions.
diff --git a/docs/contents/visual_debugging.rst b/docs/contents/visual_debugging.rst
@@ -1,2 +1,10 @@
 Visual debugging
-================
+================
+
+Requirements
+------------
+
+The visual debugging feature requires Ghostscript to extract images from PDF files.
+Ghostscript must be installed separately on your system.
+
+See `Ghostscript <https://www.ghostscript.com/>`_ for installation instructions.
diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py
@@ -61,13 +61,12 @@ def __init__(  # pylint: disable=too-many-arguments  # the parameters are needed
         if pdfminer is not None:
             # take argument first
             self.pdfminer = pdfminer
+        elif pdfplumber is not None:
+            # set from pdfplumber document
+            self.pdfminer = pdfplumber.doc
         else:
-            if pdfplumber is not None:
-                # set from pdfplumber document
-                self.pdfminer = pdfplumber.doc
-            else:
-                # nothing available
-                self.pdfminer = None
+            # nothing available
+            self.pdfminer = None
 
 
 class Flattened(NamedTuple):

diff --git a/libpdf/catalog.py b/libpdf/catalog.py
@@ -124,7 +124,7 @@ def resolve_name_obj(name_tree_kids):
     """
     temp_list = []
     for kid in name_tree_kids:
-        if "Kids" in kid and kid["Kids"]:
+        if kid.get("Kids"):
             temp_list.extend([kid_kid.resolve() for kid_kid in kid["Kids"]])
         elif "Names" in kid:
             return name_tree_kids
@@ -311,14 +311,12 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf):  # pylint: disabl
                 raise RuntimeError(
                     f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object"
                 )
+        elif isinstance(outline_obj["Dest"], PSLiteral):
+            # PDF 1.1 name object
+            outline_dest = outline_obj["Dest"].name
         else:
-            # named destination
-            if isinstance(outline_obj["Dest"], PSLiteral):
-                # PDF 1.1 name object
-                outline_dest = outline_obj["Dest"].name
-            else:
-                # PDF 1.2 byte string
-                outline_dest = outline_obj["Dest"].decode("utf-8")
+            # PDF 1.2 byte string
+            outline_dest = outline_obj["Dest"].decode("utf-8")
         title_bytes = outline_obj["Title"]
     else:
         raise ValueError("No key A and Dest in outline.")

diff --git a/libpdf/extract.py b/libpdf/extract.py
@@ -429,9 +429,8 @@ def check_false_positive_header_footer(pdf, elements_list):  # pylint: disable=t
                 # recursively check again, to find the next min_low_pos, which will determine the header/footer boundary
                 if elements_list:
                     return check_false_positive_header_footer(pdf, elements_list)
-        else:
-            if len(elements_list) == 1:
-                elements_list.pop()
+        elif len(elements_list) == 1:
+            elements_list.pop()
     else:
         for idx, element in enumerate(elements_list):
             if float(f"{element.position.y0:.4f}") == header_low_pos:
@@ -853,12 +852,9 @@ def check_and_filter_figures(figures_list):  # pylint: disable=too-many-branches
                             "remove filtered figure fig0 due to partially overlap"
                         )
                         filtered_figures.remove(fig0)
-                else:
-                    if fig1 in filtered_figures:
-                        LOG.debug(
-                            "remove filtered figure fig1 due to partially overlap"
-                        )
-                        filtered_figures.remove(fig1)
+                elif fig1 in filtered_figures:
+                    LOG.debug("remove filtered figure fig1 due to partially overlap")
+                    filtered_figures.remove(fig1)
 
     if len(filtered_figures) < len(figures_list):
         LOG.debug(

diff --git a/libpdf/log.py b/libpdf/log.py
@@ -71,14 +71,13 @@ def config_logger(cli=True):
             init_tqdm = True
         else:
             init_basic = True
-    else:  # API usage
-        if TQDM_AVAILABLE:
-            # this needs to be documented so any API user is not surprised that the libpdf logger has an attached
-            # handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be
-            # used or the user wants to define something else
-            init_tqdm = True
-        else:  # don't init anything, it's up to the user
-            pass
+    elif TQDM_AVAILABLE:
+        # this needs to be documented so any API user is not surprised that the libpdf logger has an attached
+        # handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be
+        # used or the user wants to define something else
+        init_tqdm = True
+    else:  # don't init anything, it's up to the user
+        pass
 
     log_format = "[%(levelname)5s] %(name)s - %(message)s"
     if init_tqdm:

diff --git a/libpdf/models/model_base.py b/libpdf/models/model_base.py
@@ -18,9 +18,8 @@ def to_dict(self):
         for key, value in vars_dict.items():
             if key.startswith("b_"):
                 delete_backref_keys.append(key)
-            else:
-                if isinstance(value, ModelBase):
-                    vars_dict[key] = value.to_dict()
+            elif isinstance(value, ModelBase):
+                vars_dict[key] = value.to_dict()
         # delete back references
         for key in delete_backref_keys:
             del vars_dict[key]

diff --git a/libpdf/process.py b/libpdf/process.py
@@ -309,21 +309,20 @@ def fill_elements_content(
             id_dict = {"table": 1, "figure": 1, "paragraph": 1, "rect": 1}
             content = elements_in_outline[index_element].content
             index_b_chapter = index_element
+        elif "content" in locals():
+            element.idx = id_dict[element.type]
+            element.b_chapter = elements_in_outline[index_b_chapter]
+            content.append(element)
+            id_dict[element.type] += 1
         else:
-            if "content" in locals():
-                element.idx = id_dict[element.type]
-                element.b_chapter = elements_in_outline[index_b_chapter]
-                content.append(element)
-                id_dict[element.type] += 1
-            else:
-                # TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?)
-                #      2. the message is unclear
-                #      3. if it's a programming error, fix the code
-                #      4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above
-                #         and log an understandable, critical error
-                raise ValueError(
-                    "elements can not fill into the content because it does not exist"
-                )
+            # TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?)
+            #      2. the message is unclear
+            #      3. if it's a programming error, fix the code
+            #      4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above
+            #         and log an understandable, critical error
+            raise ValueError(
+                "elements can not fill into the content because it does not exist"
+            )
 
     chapters_content = list(
         filter(lambda x: isinstance(x, Chapter), elements_in_outline)
@@ -435,20 +434,19 @@ def libpdf_target_explorer(  # pylint: disable=too-many-nested-blocks # local al
                         for link in element.links:
                             target_id = find_target_id(link, pages_list, element)
                             link.libpdf_target = target_id
+                    elif isinstance(element, Cell):
+                        # Cell is not considered as element
+                        pass
                     else:
-                        if isinstance(element, Cell):
-                            # Cell is not considered as element
-                            pass
-                        else:
-                            # TODO reason about the overall logic; which cases can be removed? distinguish between
-                            #      programming errors (raise RuntimeErrors) and cases that actually may exist in the
-                            #      wild and write human-readable log messages (e.g.
-                            #        The link on page xy with text xy cannot be resolved to a libpdf element; linking
-                            #        to the target page position instead
-                            LOG.error(
-                                "The source link in the paragraph %s is missing",
-                                repr(element),
-                            )
+                        # TODO reason about the overall logic; which cases can be removed? distinguish between
+                        #      programming errors (raise RuntimeErrors) and cases that actually may exist in the
+                        #      wild and write human-readable log messages (e.g.
+                        #        The link on page xy with text xy cannot be resolved to a libpdf element; linking
+                        #        to the target page position instead
+                        LOG.error(
+                            "The source link in the paragraph %s is missing",
+                            repr(element),
+                        )
 
 
 def elements_with_anno_finder(

diff --git a/libpdf/textbox.py b/libpdf/textbox.py
@@ -79,10 +79,9 @@ def extract_paragraphs_chapters(
     chapter_list = []
     if no_chapters:
         LOG.info("Excluding chapters extraction")
-    else:
-        if catalog["outline"]:
-            LOG.info("Extracting chapters ...")
-            chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)
+    elif catalog["outline"]:
+        LOG.info("Extracting chapters ...")
+        chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)
 
     paragraph_list = []
     if no_paragraphs:
@@ -782,17 +781,15 @@ def first_last_char_in_anno_marker(  # pylint: disable=too-many-branches # bette
             # the incoming char is outside the anno-rectangle
             pass
 
-    else:
-        # the char is LTAnno
-        if idx_char == len(ltobjs_in_lttextline) - 1:
-            # the last char of the textline
+    elif idx_char == len(ltobjs_in_lttextline) - 1:
+        # the last char of the textline
+        anno_complete = True
+    elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar):
+        if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]:
+            # the next char is outside of the current anno-rectangle
             anno_complete = True
-        elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar):
-            if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]:
-                # the next char is outside of the current anno-rectangle
-                anno_complete = True
-        else:
-            raise ValueError("two LTAnno occurs in a row")
+    else:
+        raise ValueError("two LTAnno occurs in a row")
 
     return anno_complete
 

diff --git a/libpdf/utils.py b/libpdf/utils.py
@@ -303,43 +303,40 @@ def find_lt_obj_in_bbox(
     ):
         # This is the case when a LT object is neither inside nor intersected with the given bounding box.
         pass
-    else:
-        # This is the case when a LT object is intersected with the given box. In this case, the LT objects inside the
-        # given bounding box need to be hierarchically and recursively found.
-        if hasattr(lt_obj, "_objs"):
-            # All the downwards hierarchical LT objects are stored in the attribute "_objs".
-            # If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy.
-            text_inside_bbox = False  # True on LTTextLine level when the first LTChar is inside the BBOX
-            for item in lt_obj._objs:  # pylint: disable=protected-access
-                if isinstance(item, LTAnno):
-                    # special treatment of LTAnno because it is virtual with no position data
-                    if text_inside_bbox:
-                        # LTAnno is added because an LTChar was inside the bbox before
+    elif hasattr(lt_obj, "_objs"):
+        # All the downwards hierarchical LT objects are stored in the attribute "_objs".
+        # If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy.
+        text_inside_bbox = (
+            False  # True on LTTextLine level when the first LTChar is inside the BBOX
+        )
+        for item in lt_obj._objs:  # pylint: disable=protected-access
+            if isinstance(item, LTAnno):
+                # special treatment of LTAnno because it is virtual with no position data
+                if text_inside_bbox:
+                    # LTAnno is added because an LTChar was inside the bbox before
+                    lt_objs_in_bbox.append(item)
+            elif isinstance(item, LTChar):
+                # check if the first and last LTChar have shown in the given bbox to decide if the trailing
+                # LTAnno should be added
+                ltchar_inside = check_lt_obj_in_bbox(item, bbox)
+                if text_inside_bbox:
+                    if ltchar_inside:
                         lt_objs_in_bbox.append(item)
-                else:
-                    if isinstance(item, LTChar):
-                        # check if the first and last LTChar have shown in the given bbox to decide if the trailing
-                        # LTAnno should be added
-                        ltchar_inside = check_lt_obj_in_bbox(item, bbox)
-                        if text_inside_bbox:
-                            if ltchar_inside:
-                                lt_objs_in_bbox.append(item)
-                            else:
-                                # the bbox just ended and can't enter again
-                                break
-                        else:
-                            if ltchar_inside:
-                                lt_objs_in_bbox.append(item)
-                                text_inside_bbox = True
-                            else:
-                                # no LTChar was added before, so not in BBOX yet
-                                pass
                     else:
-                        # it is not an LTAnno nor an LTChar, so recurse and break it further down
-                        find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox)
-        else:
-            # no attribute "_objs" exists. It reaches the bottom of the hierarchy
-            pass
+                        # the bbox just ended and can't enter again
+                        break
+                elif ltchar_inside:
+                    lt_objs_in_bbox.append(item)
+                    text_inside_bbox = True
+                else:
+                    # no LTChar was added before, so not in BBOX yet
+                    pass
+            else:
+                # it is not an LTAnno nor an LTChar, so recurse and break it further down
+                find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox)
+    else:
+        # no attribute "_objs" exists. It reaches the bottom of the hierarchy
+        pass
 
 
 def lt_page_crop(