Skip to content

Commit

Permalink
Adding tests for rect extractions (#36)
Browse files Browse the repository at this point in the history
* adding tests for rect
* finish basic tests
* disable visual debug for tests, seems to depend on external tools
* adding ghostscript hint in visual debug
  • Loading branch information
kreuzberger authored Feb 21, 2024
1 parent c285d7d commit 29fe5a3
Show file tree
Hide file tree
Showing 18 changed files with 388 additions and 190 deletions.
10 changes: 9 additions & 1 deletion docs/contents/visual_debugging.rst
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
Visual debugging
================
================

Requirements
------------

The visual debugging feature requires Ghostscript to extract images from PDF files.
Ghostscript is not bundled with libpdf and must be installed separately on your system.

See `Ghostscript <https://www.ghostscript.com/>`_ for installation instructions.
11 changes: 5 additions & 6 deletions libpdf/apiobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,12 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed
if pdfminer is not None:
# take argument first
self.pdfminer = pdfminer
elif pdfplumber is not None:
# set from pdfplumber document
self.pdfminer = pdfplumber.doc
else:
if pdfplumber is not None:
# set from pdfplumber document
self.pdfminer = pdfplumber.doc
else:
# nothing available
self.pdfminer = None
# nothing available
self.pdfminer = None


class Flattened(NamedTuple):
Expand Down
14 changes: 6 additions & 8 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def resolve_name_obj(name_tree_kids):
"""
temp_list = []
for kid in name_tree_kids:
if "Kids" in kid and kid["Kids"]:
if kid.get("Kids"):
temp_list.extend([kid_kid.resolve() for kid_kid in kid["Kids"]])
elif "Names" in kid:
return name_tree_kids
Expand Down Expand Up @@ -311,14 +311,12 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl
raise RuntimeError(
f"Page {outline_obj['Dest'][0]} is not an indirect reference to a page object"
)
elif isinstance(outline_obj["Dest"], PSLiteral):
# PDF 1.1 name object
outline_dest = outline_obj["Dest"].name
else:
# named destination
if isinstance(outline_obj["Dest"], PSLiteral):
# PDF 1.1 name object
outline_dest = outline_obj["Dest"].name
else:
# PDF 1.2 byte string
outline_dest = outline_obj["Dest"].decode("utf-8")
# PDF 1.2 byte string
outline_dest = outline_obj["Dest"].decode("utf-8")
title_bytes = outline_obj["Title"]
else:
raise ValueError("No key A and Dest in outline.")
Expand Down
14 changes: 5 additions & 9 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,8 @@ def check_false_positive_header_footer(pdf, elements_list): # pylint: disable=t
# recursively check again, to find the next min_low_pos, which will determine the header/footer boundary
if elements_list:
return check_false_positive_header_footer(pdf, elements_list)
else:
if len(elements_list) == 1:
elements_list.pop()
elif len(elements_list) == 1:
elements_list.pop()
else:
for idx, element in enumerate(elements_list):
if float(f"{element.position.y0:.4f}") == header_low_pos:
Expand Down Expand Up @@ -853,12 +852,9 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
"remove filtered figure fig0 due to partially overlap"
)
filtered_figures.remove(fig0)
else:
if fig1 in filtered_figures:
LOG.debug(
"remove filtered figure fig1 due to partially overlap"
)
filtered_figures.remove(fig1)
elif fig1 in filtered_figures:
LOG.debug("remove filtered figure fig1 due to partially overlap")
filtered_figures.remove(fig1)

if len(filtered_figures) < len(figures_list):
LOG.debug(
Expand Down
15 changes: 7 additions & 8 deletions libpdf/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,13 @@ def config_logger(cli=True):
init_tqdm = True
else:
init_basic = True
else: # API usage
if TQDM_AVAILABLE:
# this needs to be documented so any API user is not surprised that the libpdf logger has an attached
# handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be
# used or the user wants to define something else
init_tqdm = True
else: # don't init anything, it's up to the user
pass
elif TQDM_AVAILABLE:
# this needs to be documented so any API user is not surprised that the libpdf logger has an attached
# handler; users may delete it if unwanted or it could be configurable later if tqdm handler should be
# used or the user wants to define something else
init_tqdm = True
else: # don't init anything, it's up to the user
pass

log_format = "[%(levelname)5s] %(name)s - %(message)s"
if init_tqdm:
Expand Down
5 changes: 2 additions & 3 deletions libpdf/models/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ def to_dict(self):
for key, value in vars_dict.items():
if key.startswith("b_"):
delete_backref_keys.append(key)
else:
if isinstance(value, ModelBase):
vars_dict[key] = value.to_dict()
elif isinstance(value, ModelBase):
vars_dict[key] = value.to_dict()
# delete back references
for key in delete_backref_keys:
del vars_dict[key]
Expand Down
52 changes: 25 additions & 27 deletions libpdf/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,21 +309,20 @@ def fill_elements_content(
id_dict = {"table": 1, "figure": 1, "paragraph": 1, "rect": 1}
content = elements_in_outline[index_element].content
index_b_chapter = index_element
elif "content" in locals():
element.idx = id_dict[element.type]
element.b_chapter = elements_in_outline[index_b_chapter]
content.append(element)
id_dict[element.type] += 1
else:
if "content" in locals():
element.idx = id_dict[element.type]
element.b_chapter = elements_in_outline[index_b_chapter]
content.append(element)
id_dict[element.type] += 1
else:
# TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?)
# 2. the message is unclear
# 3. if it's a programming error, fix the code
# 4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above
# and log an understandable, critical error
raise ValueError(
"elements can not fill into the content because it does not exist"
)
# TODO 1. this exception is not caught in libpdf code and will go all the way up to the user (wanted?)
# 2. the message is unclear
# 3. if it's a programming error, fix the code
# 4. if it's a real runtime issue coming from wrong PDF input, catch the error one level above
# and log an understandable, critical error
raise ValueError(
"elements can not fill into the content because it does not exist"
)

chapters_content = list(
filter(lambda x: isinstance(x, Chapter), elements_in_outline)
Expand Down Expand Up @@ -435,20 +434,19 @@ def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local al
for link in element.links:
target_id = find_target_id(link, pages_list, element)
link.libpdf_target = target_id
elif isinstance(element, Cell):
# Cell is not considered as element
pass
else:
if isinstance(element, Cell):
# Cell is not considered as element
pass
else:
# TODO reason about the overall logic; which cases can be removed? distinguish between
# programming errors (raise RuntimeErrors) and cases that actually may exist in the
# wild and write human-readable log messages (e.g.
# The link on page xy with text xy cannot be resolved to a libpdf element; linking
# to the target page position instead
LOG.error(
"The source link in the paragraph %s is missing",
repr(element),
)
# TODO reason about the overall logic; which cases can be removed? distinguish between
# programming errors (raise RuntimeErrors) and cases that actually may exist in the
# wild and write human-readable log messages (e.g.
# The link on page xy with text xy cannot be resolved to a libpdf element; linking
# to the target page position instead
LOG.error(
"The source link in the paragraph %s is missing",
repr(element),
)


def elements_with_anno_finder(
Expand Down
25 changes: 11 additions & 14 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,9 @@ def extract_paragraphs_chapters(
chapter_list = []
if no_chapters:
LOG.info("Excluding chapters extraction")
else:
if catalog["outline"]:
LOG.info("Extracting chapters ...")
chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)
elif catalog["outline"]:
LOG.info("Extracting chapters ...")
chapter_list = render_chapters(extracted_lt_textboxes, page_list, pdf)

paragraph_list = []
if no_paragraphs:
Expand Down Expand Up @@ -782,17 +781,15 @@ def first_last_char_in_anno_marker( # pylint: disable=too-many-branches # bette
# the incoming char is outside the anno-rectangle
pass

else:
# the char is LTAnno
if idx_char == len(ltobjs_in_lttextline) - 1:
# the last char of the textline
elif idx_char == len(ltobjs_in_lttextline) - 1:
# the last char of the textline
anno_complete = True
elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar):
if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]:
# the next char is outside of the current anno-rectangle
anno_complete = True
elif isinstance(ltobjs_in_lttextline[idx_char + 1], LTChar):
if ltobjs_in_lttextline[idx_char + 1].x0 > anno["rect"][2]:
# the next char is outside of the current anno-rectangle
anno_complete = True
else:
raise ValueError("two LTAnno occurs in a row")
else:
raise ValueError("two LTAnno occurs in a row")

return anno_complete

Expand Down
67 changes: 32 additions & 35 deletions libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,43 +303,40 @@ def find_lt_obj_in_bbox(
):
# This is the case when a LT object is neither inside nor intersected with the given bounding box.
pass
else:
# This is the case when a LT object is intersected with the given box. In this case, the LT objects inside the
# given bounding box need to be hierarchically and recursively found.
if hasattr(lt_obj, "_objs"):
# All the downwards hierarchical LT objects are stored in the attribute "_objs".
# If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy.
text_inside_bbox = False # True on LTTextLine level when the first LTChar is inside the BBOX
for item in lt_obj._objs: # pylint: disable=protected-access
if isinstance(item, LTAnno):
# special treatment of LTAnno because it is virtual with no position data
if text_inside_bbox:
# LTAnno is added because an LTChar was inside the bbox before
elif hasattr(lt_obj, "_objs"):
# All the downwards hierarchical LT objects are stored in the attribute "_objs".
# If the _objs attribute doesn't exist, it means it's the bottom of the hierarchy.
text_inside_bbox = (
False # True on LTTextLine level when the first LTChar is inside the BBOX
)
for item in lt_obj._objs: # pylint: disable=protected-access
if isinstance(item, LTAnno):
# special treatment of LTAnno because it is virtual with no position data
if text_inside_bbox:
# LTAnno is added because an LTChar was inside the bbox before
lt_objs_in_bbox.append(item)
elif isinstance(item, LTChar):
# check if the first and last LTChar have shown in the given bbox to decide if the trailing
# LTAnno should be added
ltchar_inside = check_lt_obj_in_bbox(item, bbox)
if text_inside_bbox:
if ltchar_inside:
lt_objs_in_bbox.append(item)
else:
if isinstance(item, LTChar):
# check if the first and last LTChar have shown in the given bbox to decide if the trailing
# LTAnno should be added
ltchar_inside = check_lt_obj_in_bbox(item, bbox)
if text_inside_bbox:
if ltchar_inside:
lt_objs_in_bbox.append(item)
else:
# the bbox just ended and can't enter again
break
else:
if ltchar_inside:
lt_objs_in_bbox.append(item)
text_inside_bbox = True
else:
# no LTChar was added before, so not in BBOX yet
pass
else:
# it is not an LTAnno nor an LTChar, so recurse and break it further down
find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox)
else:
# no attribute "_objs" exists. It reaches the bottom of the hierarchy
pass
# the bbox just ended and can't enter again
break
elif ltchar_inside:
lt_objs_in_bbox.append(item)
text_inside_bbox = True
else:
# no LTChar was added before, so not in BBOX yet
pass
else:
# it is not an LTAnno nor an LTChar, so recurse and break it further down
find_lt_obj_in_bbox(lt_objs_in_bbox, item, bbox)
else:
# no attribute "_objs" exists. It reaches the bottom of the hierarchy
pass


def lt_page_crop(
Expand Down
Loading

0 comments on commit 29fe5a3

Please sign in to comment.