Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rect model #30

Merged
merged 18 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/contents/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ Figure

.. autoclass:: libpdf.models.figure.Figure

Rect
~~~~~~

.. autoclass:: libpdf.models.rect.Rect


Position
~~~~~~~~
Expand Down
8 changes: 7 additions & 1 deletion docs/contents/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,15 @@ __ https://github.com/useblocks/libpdf/compare/v0.0.1...HEAD
Added
~~~~~

- Introduced new element ``Rect`` (`PR #30 <https://github.com/useblocks/libpdf/pull/30>`_)
- Introduced Ruff as linter and formatter (PRs `#28 <https://github.com/useblocks/libpdf/pull/28>`_,
`#29 <https://github.com/useblocks/libpdf/pull/29>`_ and `#31 <https://github.com/useblocks/libpdf/pull/31>`_)
- Added support for Python 3.10, 3.11 and 3.12 (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)

Changed
~~~~~~~

- Updated downstream library dependencies

- Added flag ``--no_annotations`` to exclude annotation extraction from the catalog to speed up extraction
(`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)

Expand All @@ -33,6 +37,8 @@ Removed

- libpdf has dropped support for Python 3.6, which reached end-of-life on 2021-12-23; this also fixes the Pillow 8
security vulnerabilities
- libpdf has dropped support for Python 3.7, which reached end-of-life on 2023-06-27
(`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)

Fixed
~~~~~
Expand Down
14 changes: 11 additions & 3 deletions docs/contents/pdf_model.puml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@
**rel_path**: figures/<image>
}

class Rect {
+ [tuple] non_stroking_color
--
**id**: ""rect.<1,2,3,n>""
**non_stroking_color**: RGB color space,
given as floats between 0 and 1
}

package x <<layout>> {

class HorizontalBox {
Expand Down Expand Up @@ -217,9 +225,7 @@
--
**text**: the character itself
}



}
Char "+chars 1..*" ---left* Word
Word "+words 1..*" ---left* HorizontalLine
HorizontalLine "+lines 1..*" --* HorizontalBox
Expand All @@ -228,6 +234,7 @@
HorizontalBox "+textbox 0..1" --* Cell
HorizontalBox "+textboxes 0..*" --* Figure
HorizontalBox "+textbox 0..1" --* Chapter
HorizontalBox "+textbox 0..1" --* Rect

Paragraph "+b_source 1" *-- "+links *" Link
Figure "+b_source 1" *-- "+links *" Link
Expand All @@ -240,6 +247,7 @@
Figure -[#COL_DERIVE]-|> Element
Chapter -[#COL_DERIVE]-|> Element
Paragraph -[#COL_DERIVE]|> Element
Rect -[#COL_DERIVE]|> Element

Chapter "1" *-- "+content *" Element: ordered
Page "1" *-- "+content *" Element: ordered
Expand Down
9 changes: 8 additions & 1 deletion libpdf/apiobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from libpdf.models.chapter import Chapter
from libpdf.models.figure import Figure
from libpdf.models.paragraph import Paragraph
from libpdf.models.rect import Rect
from libpdf.models.root import Root
from libpdf.models.table import Table

Expand Down Expand Up @@ -36,6 +37,7 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed
paragraphs: List[Paragraph],
tables: List[Table],
figures: List[Figure],
rects: List[Rect],
pdfplumber: PDF,
pdfminer: PDFDocument,
):
Expand All @@ -45,7 +47,11 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed

# attributes for API convenience
self.flattened = Flattened(
chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures
chapters=chapters,
paragraphs=paragraphs,
tables=tables,
figures=figures,
rects=rects,
)

# exposing the pdfplumber PDF object
Expand All @@ -71,3 +77,4 @@ class Flattened(NamedTuple):
paragraphs: List[Paragraph]
tables: List[Table]
figures: List[Figure]
rects: List[Rect]
4 changes: 2 additions & 2 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from typing import Any, Dict, List, Union

from pdfminer.pdftypes import PDFObjRef
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral

from libpdf.log import logging_needed
Expand Down Expand Up @@ -51,7 +51,7 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches
):
name_tree = pdf_catalog["Names"].resolve()["Dests"].resolve()
elif isinstance(pdf_catalog["Names"], dict) and "Dests" in pdf_catalog["Names"]:
name_tree = pdf_catalog["Names"]["Dests"].resolve()
name_tree = resolve1(pdf_catalog["Names"]["Dests"])
# check if name tree not empty
if name_tree:
# map page id to page number
Expand Down
14 changes: 14 additions & 0 deletions libpdf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
no_paragraphs: bool = False,
no_tables: bool = False,
no_figures: bool = False,
no_rects: bool = False,
cli_usage: bool = False,
visual_debug: bool = False,
visual_debug_output_dir: str = None,
Expand Down Expand Up @@ -66,6 +67,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
:param no_figures: flag triggering the exclusion of figures
:param no_rects: flag triggering the exclusion of rects
:param cli_usage: flag indicating that the function was called through CLI
:param visual_debug: flag triggering visual debug feature
:param visual_debug_output_dir: output directory for visualized pdf pages
Expand Down Expand Up @@ -120,6 +122,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes")
LOG.info("Extract tables: %s", "no" if no_tables else "yes")
LOG.info("Extract figures: %s", "no" if no_figures else "yes")
LOG.info("Extract rects: %s", "no" if no_rects else "yes")
overall_pbar.update(1)
try:
objects = extract(
Expand All @@ -133,6 +136,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
no_paragraphs,
no_tables,
no_figures,
no_rects,
overall_pbar,
)
except LibpdfException:
Expand Down Expand Up @@ -177,6 +181,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals
no_paragraphs: bool = False,
no_tables: bool = False,
no_figures: bool = False,
no_rects: bool = False,
init_logging: bool = True,
visual_debug: bool = False,
visual_debug_output_dir: str = "visual_debug_libpdf",
Expand All @@ -202,6 +207,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
:param no_figures: flag triggering the exclusion of figures
:param no_rects: flag triggering the exclusion of rects
:param init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of
handling both log messages and progress bars; it does so by passing all log messages to
tqdm.write()
Expand Down Expand Up @@ -245,6 +251,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals
no_paragraphs=no_paragraphs,
no_tables=no_tables,
no_figures=no_figures,
no_rects=no_rects,
cli_usage=False,
visual_debug=visual_debug,
visual_debug_output_dir=visual_debug_output_dir,
Expand Down Expand Up @@ -463,6 +470,13 @@ def handle_parse_result(self, ctx, opts, args):
help="Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if"
" --save-figures is given.",
)
@click.option(
"--no-rects",
is_flag=True,
show_default=True,
help="Skip rects. rects will not be part of the output JSON/YAML structures and also not saved if"
" --save-rects is given.",
)
@click.option("-vd", "--visual-debug", is_flag=True, help="Visual debug libpdf.")
@click.option(
"-vo",
Expand Down
112 changes: 109 additions & 3 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from libpdf.models.file_meta import FileMeta
from libpdf.models.page import Page
from libpdf.models.position import Position
from libpdf.models.rect import Rect
from libpdf.models.root import Root
from libpdf.parameters import (
FIGURE_MIN_HEIGHT,
Expand All @@ -33,7 +34,12 @@
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.tables import extract_pdf_table
from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters
from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox
from libpdf.utils import (
lt_page_crop,
lt_textbox_crop,
lt_to_libpdf_hbox_converter,
to_pdfplumber_bbox,
)

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -61,6 +67,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
no_paragraphs: bool,
no_tables: bool,
no_figures: bool,
no_rects: bool,
overall_pbar: tqdm,
) -> ApiObjects:
"""
Expand All @@ -76,6 +83,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
:param no_figures: flag triggering the exclusion of figures
:param no_rects: flag triggering the exclusion of rects
:param overall_pbar: total progress bar for whole libpdf run
:return: instance of Objects class
:raise LibpdfException: PDF contains no pages
Expand Down Expand Up @@ -142,7 +150,17 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
# smartly remove figures that are in header and footer
if smart_page_crop:
figure_list = smart_page_crop_header_footer(pdf, figure_list)
overall_pbar.update(30)
overall_pbar.update(15)

if no_rects:
LOG.info("Excluding rects extraction")
rect_list = []
else:
rect_list = extract_rects(pdf, pages_list)
# smartly remove rects that are in header and footer
if smart_page_crop:
rect_list = smart_page_crop_header_footer(pdf, rect_list)
overall_pbar.update(15)

if no_tables:
LOG.info("Excluding tables extraction")
Expand All @@ -169,7 +187,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list)

element_list = pro.merge_all_elements(
figure_list, table_list, paragraph_list, chapter_list
figure_list, table_list, paragraph_list, chapter_list, rect_list
)

# to check if elements shall be mapped into nested outline structure.
Expand All @@ -195,6 +213,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
paragraphs=paragraph_list,
tables=table_list,
figures=figure_list,
rects=rect_list,
pdfplumber=pdf,
pdfminer=pdf.doc,
)
Expand Down Expand Up @@ -640,6 +659,76 @@ def extract_figures(
return figure_list


def extract_rects(
    pdf,
    pages_list,
) -> List[Rect]:
    """
    Extract the rects of all pages in the PDF.

    For every entry found under the ``rect`` key of a page's pdfplumber objects a libpdf ``Rect`` is created. It
    carries the position of the rect, its non-stroking color and - if ``lt_textbox_crop`` returns a text box for the
    slightly enlarged rect area - that text box converted by ``lt_to_libpdf_hbox_converter``.

    :param pdf: pdfplumber PDF instance whose pages are scanned
    :param pages_list: page objects handed to ``Position``; must be indexable in the same order as ``pdf.pages``
    :return: list of all extracted rects in page order; the rect index restarts at 1 on every page
    """
    LOG.info("Extracting rects ...")
    rect_list = []

    pages = pdf.pages
    page_count = len(pages)
    # margin added on every side of a rect before looking for text in its area
    offset = 5

    for idx_page, page in enumerate(
        tqdm(
            pages,
            desc="###### Extracting rects",
            unit="pages",
            bar_format=bar_format_lvl2(),
        ),
    ):
        page_number = idx_page + 1
        if logging_needed(idx_page, page_count):
            LOG.debug("Extracting rects page %s of %s", page_number, page_count)

        # NOTE(review): the cropped page is only consumed by the "found no rects" message below, the rects
        # themselves are read from the uncropped page; the call is kept at this position to preserve the
        # original statement order
        page_crop = pro.remove_page_header_footer(page)
        lt_page = page._layout  # pylint: disable=protected-access # easiest way to obtain LTPage

        page_rects = page.objects.get("rect", [])
        if not page_rects:
            LOG.info(
                "found no rects on page %s: %s",
                page_number,
                page_crop.objects.keys(),
            )
            continue

        for idx_rect, raw_rect in enumerate(page_rects):
            rect_pos = Position(
                float(raw_rect["x0"]),
                float(raw_rect["y0"]),
                float(raw_rect["x1"]),
                float(raw_rect["y1"]),
                pages_list[idx_page],
            )

            non_stroking_color = raw_rect["non_stroking_color"]

            # search area for text: the rect enlarged by ``offset`` in every direction
            rect_bbox = (
                rect_pos.x0 - offset,
                rect_pos.y0 - offset,
                rect_pos.x1 + offset,
                rect_pos.y1 + offset,
            )

            LOG.info(
                "found rect at %s at page %s: color %s",
                rect_bbox,
                page_number,
                non_stroking_color,
            )
            lt_textbox = lt_textbox_crop(
                rect_bbox,
                lt_page._objs,  # pylint: disable=protected-access # layout objects of the LTPage
                word_margin=LA_PARAMS["word_margin"],
                y_tolerance=LA_PARAMS["line_overlap"],
            )
            # a rect without a text box is stored with no textbox at all
            hbox = lt_to_libpdf_hbox_converter(lt_textbox) if lt_textbox else None

            rect_list.append(Rect(idx_rect + 1, rect_pos, hbox, non_stroking_color))

    return rect_list


def images_to_save(pdf, figure_list):
"""Save images to given path."""
for fig in figure_list:
Expand Down Expand Up @@ -708,6 +797,11 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
if figure["height"] > FIGURE_MIN_HEIGHT and figure["width"] > FIGURE_MIN_WIDTH:
filtered_figures.append(figure)

if len(filtered_figures) < len(figures_list):
LOG.debug(
f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} due to invalid height/width"
)

for figure in filtered_figures:
# if figure exceed the boundary of the page, then only keep the part of figure that inside this page
if not (
Expand All @@ -734,6 +828,7 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
and fig0["y1"] >= fig1["y1"]
):
if fig1 in filtered_figures:
LOG.debug("remove filtered figure due to contained in other figure")
filtered_figures.remove(fig1)

# check if figures partially overlap
Expand All @@ -754,9 +849,20 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
# compare the size of two figures, keep the bigger figure
if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]:
if fig0 in filtered_figures:
LOG.debug(
"remove filtered figure fig0 due to partially overlap"
)
filtered_figures.remove(fig0)
else:
if fig1 in filtered_figures:
LOG.debug(
"remove filtered figure fig1 due to partially overlap"
)
filtered_figures.remove(fig1)

if len(filtered_figures) < len(figures_list):
LOG.debug(
f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures"
)

return filtered_figures
Loading
Loading