Skip to content

Commit

Permalink
Merge pull request #30 from useblocks/rect-model
Browse files Browse the repository at this point in the history
Rect model
  • Loading branch information
juiwenchen authored Jan 23, 2024
2 parents d08febc + 946c820 commit aa2999f
Show file tree
Hide file tree
Showing 12 changed files with 239 additions and 26 deletions.
5 changes: 5 additions & 0 deletions docs/contents/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ Figure

.. autoclass:: libpdf.models.figure.Figure

Rect
~~~~~~

.. autoclass:: libpdf.models.rect.Rect


Position
~~~~~~~~
Expand Down
8 changes: 7 additions & 1 deletion docs/contents/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,15 @@ __ https://github.com/useblocks/libpdf/compare/v0.0.1...HEAD
Added
~~~~~

- Introduced new element ``Rect`` (`PR #30 <https://github.com/useblocks/libpdf/pull/30>`_)
- Introduced Ruff as linter and formatter (PRs `#28 <https://github.com/useblocks/libpdf/pull/28>`_,
`#29 <https://github.com/useblocks/libpdf/pull/29>`_ and `#31 <https://github.com/useblocks/libpdf/pull/31>`_)
- Added support for Python 3.10, 3.11 and 3.12 (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)

Changed
~~~~~~~

- Updated downstream library dependencies

- Added flag ``--no_annotations`` to exclude annotation extraction from the catalog to speed up extraction
(`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_)

Expand All @@ -33,6 +37,8 @@ Removed

- libpdf has dropped support for Python 3.6, which reached end-of-life on 2021-12-23; this also fixes the Pillow 8
security vulnerabilities
- libpdf has dropped support for Python 3.7, which reached end-of-life on 2023-06-27
(`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_)

Fixed
~~~~~
Expand Down
14 changes: 11 additions & 3 deletions docs/contents/pdf_model.puml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@
**rel_path**: figures/<image>
}

class Rect {
+ [tuple] non_stroking_color
--
**id**: ""rect.<1,2,3,n>""
**non_stroking_color**: RGB color space,
given as floats between 0 and 1
}

package x <<layout>> {

class HorizontalBox {
Expand Down Expand Up @@ -217,9 +225,7 @@
--
**text**: the character itself
}



}
Char "+chars 1..*" ---left* Word
Word "+words 1..*" ---left* HorizontalLine
HorizontalLine "+lines 1..*" --* HorizontalBox
Expand All @@ -228,6 +234,7 @@
HorizontalBox "+textbox 0..1" --* Cell
HorizontalBox "+textboxes 0..*" --* Figure
HorizontalBox "+textbox 0..1" --* Chapter
HorizontalBox "+textbox 0..1" --* Rect

Paragraph "+b_source 1" *-- "+links *" Link
Figure "+b_source 1" *-- "+links *" Link
Expand All @@ -240,6 +247,7 @@
Figure -[#COL_DERIVE]-|> Element
Chapter -[#COL_DERIVE]-|> Element
Paragraph -[#COL_DERIVE]|> Element
Rect -[#COL_DERIVE]|> Element

Chapter "1" *-- "+content *" Element: ordered
Page "1" *-- "+content *" Element: ordered
Expand Down
9 changes: 8 additions & 1 deletion libpdf/apiobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from libpdf.models.chapter import Chapter
from libpdf.models.figure import Figure
from libpdf.models.paragraph import Paragraph
from libpdf.models.rect import Rect
from libpdf.models.root import Root
from libpdf.models.table import Table

Expand Down Expand Up @@ -36,6 +37,7 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed
paragraphs: List[Paragraph],
tables: List[Table],
figures: List[Figure],
rects: List[Rect],
pdfplumber: PDF,
pdfminer: PDFDocument,
):
Expand All @@ -45,7 +47,11 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed

# attributes for API convenience
self.flattened = Flattened(
chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures
chapters=chapters,
paragraphs=paragraphs,
tables=tables,
figures=figures,
rects=rects,
)

# exposing the pdfplumber PDF object
Expand All @@ -71,3 +77,4 @@ class Flattened(NamedTuple):
paragraphs: List[Paragraph]
tables: List[Table]
figures: List[Figure]
rects: List[Rect]
4 changes: 2 additions & 2 deletions libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from typing import Any, Dict, List, Union

from pdfminer.pdftypes import PDFObjRef
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral

from libpdf.log import logging_needed
Expand Down Expand Up @@ -51,7 +51,7 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches
):
name_tree = pdf_catalog["Names"].resolve()["Dests"].resolve()
elif isinstance(pdf_catalog["Names"], dict) and "Dests" in pdf_catalog["Names"]:
name_tree = pdf_catalog["Names"]["Dests"].resolve()
name_tree = resolve1(pdf_catalog["Names"]["Dests"])
# check if name tree not empty
if name_tree:
# map page id to page number
Expand Down
14 changes: 14 additions & 0 deletions libpdf/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
no_paragraphs: bool = False,
no_tables: bool = False,
no_figures: bool = False,
no_rects: bool = False,
cli_usage: bool = False,
visual_debug: bool = False,
visual_debug_output_dir: str = None,
Expand Down Expand Up @@ -66,6 +67,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
:param no_figures: flag triggering the exclusion of figures
:param no_rects: flag triggering the exclusion of rects
:param cli_usage: flag indicating that the function was called through CLI
:param visual_debug: flag triggering visual debug feature
:param visual_debug_output_dir: output directory for visualized pdf pages
Expand Down Expand Up @@ -120,6 +122,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes")
LOG.info("Extract tables: %s", "no" if no_tables else "yes")
LOG.info("Extract figures: %s", "no" if no_figures else "yes")
LOG.info("Extract rects: %s", "no" if no_rects else "yes")
overall_pbar.update(1)
try:
objects = extract(
Expand All @@ -133,6 +136,7 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable
no_paragraphs,
no_tables,
no_figures,
no_rects,
overall_pbar,
)
except LibpdfException:
Expand Down Expand Up @@ -177,6 +181,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals
no_paragraphs: bool = False,
no_tables: bool = False,
no_figures: bool = False,
no_rects: bool = False,
init_logging: bool = True,
visual_debug: bool = False,
visual_debug_output_dir: str = "visual_debug_libpdf",
Expand All @@ -202,6 +207,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
:param no_figures: flag triggering the exclusion of figures
:param no_rects: flag triggering the exclusion of rects
:param init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of
handling both log messages and progress bars; it does so by passing all log messages to
tqdm.write()
Expand Down Expand Up @@ -245,6 +251,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals
no_paragraphs=no_paragraphs,
no_tables=no_tables,
no_figures=no_figures,
no_rects=no_rects,
cli_usage=False,
visual_debug=visual_debug,
visual_debug_output_dir=visual_debug_output_dir,
Expand Down Expand Up @@ -463,6 +470,13 @@ def handle_parse_result(self, ctx, opts, args):
help="Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if"
" --save-figures is given.",
)
@click.option(
"--no-rects",
is_flag=True,
show_default=True,
help="Skip rects. rects will not be part of the output JSON/YAML structures and also not saved if"
" --save-rects is given.",
)
@click.option("-vd", "--visual-debug", is_flag=True, help="Visual debug libpdf.")
@click.option(
"-vo",
Expand Down
112 changes: 109 additions & 3 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from libpdf.models.file_meta import FileMeta
from libpdf.models.page import Page
from libpdf.models.position import Position
from libpdf.models.rect import Rect
from libpdf.models.root import Root
from libpdf.parameters import (
FIGURE_MIN_HEIGHT,
Expand All @@ -33,7 +34,12 @@
from libpdf.progress import bar_format_lvl2, tqdm
from libpdf.tables import extract_pdf_table
from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters
from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox
from libpdf.utils import (
lt_page_crop,
lt_textbox_crop,
lt_to_libpdf_hbox_converter,
to_pdfplumber_bbox,
)

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -61,6 +67,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
no_paragraphs: bool,
no_tables: bool,
no_figures: bool,
no_rects: bool,
overall_pbar: tqdm,
) -> ApiObjects:
"""
Expand All @@ -76,6 +83,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
:param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content)
:param no_tables: flag triggering the exclusion of tables
:param no_figures: flag triggering the exclusion of figures
:param no_rects: flag triggering the exclusion of rects
:param overall_pbar: total progress bar for whole libpdf run
:return: instance of Objects class
:raise LibpdfException: PDF contains no pages
Expand Down Expand Up @@ -142,7 +150,17 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
# smartly remove figures that are in header and footer
if smart_page_crop:
figure_list = smart_page_crop_header_footer(pdf, figure_list)
overall_pbar.update(30)
overall_pbar.update(15)

if no_rects:
LOG.info("Excluding rects extraction")
rect_list = []
else:
rect_list = extract_rects(pdf, pages_list)
# smartly remove rects that are in header and footer
if smart_page_crop:
rect_list = smart_page_crop_header_footer(pdf, rect_list)
overall_pbar.update(15)

if no_tables:
LOG.info("Excluding tables extraction")
Expand All @@ -169,7 +187,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list)

element_list = pro.merge_all_elements(
figure_list, table_list, paragraph_list, chapter_list
figure_list, table_list, paragraph_list, chapter_list, rect_list
)

# to check if elements shall be mapped into nested outline structure.
Expand All @@ -195,6 +213,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta
paragraphs=paragraph_list,
tables=table_list,
figures=figure_list,
rects=rect_list,
pdfplumber=pdf,
pdfminer=pdf.doc,
)
Expand Down Expand Up @@ -640,6 +659,76 @@ def extract_figures(
return figure_list


def extract_rects(
    pdf,
    pages_list,
) -> List[Rect]:  # pylint: disable=too-many-locals # local algorithm, easier to read when not split up
    """
    Extract the rects of all PDF pages as libpdf ``Rect`` elements.

    Every pdfplumber rect object of a page is turned into a ``Rect`` that carries its position, the text
    found inside its slightly enlarged bounding box (``None`` if there is none) and its non-stroking color.

    :param pdf: instance of pdfplumber.pdf.PDF class
    :param pages_list: list of libpdf pages, indexed in the same order as ``pdf.pages``
    :return: list of all extracted rects, in page order
    """
    LOG.info("Extracting rects ...")
    rect_list = []

    # The rect bbox is enlarged by this margin on every side before the text is cropped from the page,
    # presumably so that characters touching the rect border are still captured - TODO confirm the intent.
    bbox_margin = 5
    page_count = len(pdf.pages)

    for idx_page, page in enumerate(
        tqdm(
            pdf.pages,
            desc="###### Extracting rects",
            unit="pages",
            bar_format=bar_format_lvl2(),
        ),
    ):
        if logging_needed(idx_page, page_count):
            LOG.debug("Extracting rects page %s of %s", idx_page + 1, page_count)

        # NOTE(review): the rects are read from the uncropped page below; the cropped page is only used
        # for the log message of pages without rects - confirm whether header/footer rects are wanted here.
        page_crop = pro.remove_page_header_footer(page)
        lt_page = page._layout  # pylint: disable=protected-access # easiest way to obtain LTPage

        page_rects = page.objects.get("rect", [])
        if not page_rects:
            LOG.info(
                "found no rects on page %s: %s",
                idx_page + 1,
                page_crop.objects.keys(),
            )
            continue

        # the running number used as rect ID restarts at 1 on every page
        for idx_rect, plumber_rect in enumerate(page_rects):
            rect_pos = Position(
                float(plumber_rect["x0"]),
                float(plumber_rect["y0"]),
                float(plumber_rect["x1"]),
                float(plumber_rect["y1"]),
                pages_list[idx_page],
            )
            non_stroking_color = plumber_rect["non_stroking_color"]

            rect_bbox = (
                rect_pos.x0 - bbox_margin,
                rect_pos.y0 - bbox_margin,
                rect_pos.x1 + bbox_margin,
                rect_pos.y1 + bbox_margin,
            )
            LOG.info(
                "found rect at %s at page %s: color %s",
                rect_bbox,
                idx_page + 1,
                non_stroking_color,
            )

            # collect the text located inside the enlarged rect area, if there is any
            lt_textbox = lt_textbox_crop(
                rect_bbox,
                lt_page._objs,  # pylint: disable=protected-access # direct access to the layout objects of LTPage
                word_margin=LA_PARAMS["word_margin"],
                y_tolerance=LA_PARAMS["line_overlap"],
            )
            hbox = lt_to_libpdf_hbox_converter(lt_textbox) if lt_textbox else None

            rect_list.append(Rect(idx_rect + 1, rect_pos, hbox, non_stroking_color))

    return rect_list


def images_to_save(pdf, figure_list):
"""Save images to given path."""
for fig in figure_list:
Expand Down Expand Up @@ -708,6 +797,11 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
if figure["height"] > FIGURE_MIN_HEIGHT and figure["width"] > FIGURE_MIN_WIDTH:
filtered_figures.append(figure)

if len(filtered_figures) < len(figures_list):
LOG.debug(
f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} due to invalid height/width"
)

for figure in filtered_figures:
# if figure exceed the boundary of the page, then only keep the part of figure that inside this page
if not (
Expand All @@ -734,6 +828,7 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
and fig0["y1"] >= fig1["y1"]
):
if fig1 in filtered_figures:
LOG.debug("remove filtered figure due to contained in other figure")
filtered_figures.remove(fig1)

# check if figures partially overlap
Expand All @@ -754,9 +849,20 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches
# compare the size of two figures, keep the bigger figure
if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]:
if fig0 in filtered_figures:
LOG.debug(
"remove filtered figure fig0 due to partially overlap"
)
filtered_figures.remove(fig0)
else:
if fig1 in filtered_figures:
LOG.debug(
"remove filtered figure fig1 due to partially overlap"
)
filtered_figures.remove(fig1)

if len(filtered_figures) < len(figures_list):
LOG.debug(
f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures"
)

return filtered_figures
Loading

0 comments on commit aa2999f

Please sign in to comment.