From 5433f179353d9525a0af8641796a7848f9ac910a Mon Sep 17 00:00:00 2001 From: Joerg Kreuzberger Date: Wed, 17 Jan 2024 10:23:05 +0100 Subject: [PATCH 01/18] cherry pick the commit from upgrade branch -textboxes are not excluded for rects -add rect model -extract rect --- docs/contents/api.rst | 5 ++ libpdf/apiobjects.py | 7 +-- libpdf/core.py | 37 ++++++++------ libpdf/extract.py | 114 ++++++++++++++++++++++++++++++++++++++++-- libpdf/models/rect.py | 57 +++++++++++++++++++++ libpdf/parameters.py | 27 +++++----- libpdf/process.py | 33 ++++++------ libpdf/textbox.py | 14 ++++-- libpdf/utils.py | 35 +++++++------ 9 files changed, 257 insertions(+), 72 deletions(-) create mode 100644 libpdf/models/rect.py diff --git a/docs/contents/api.rst b/docs/contents/api.rst index ee1b95c..b56afa6 100644 --- a/docs/contents/api.rst +++ b/docs/contents/api.rst @@ -72,6 +72,11 @@ Figure .. autoclass:: libpdf.models.figure.Figure +Rect +~~~~~~ + +.. autoclass:: libpdf.models.figure.Rect + Position ~~~~~~~~ diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py index b334f46..042d36b 100644 --- a/libpdf/apiobjects.py +++ b/libpdf/apiobjects.py @@ -7,6 +7,7 @@ from libpdf.models.chapter import Chapter from libpdf.models.figure import Figure +from libpdf.models.rect import Rect from libpdf.models.paragraph import Paragraph from libpdf.models.root import Root from libpdf.models.table import Table @@ -36,6 +37,7 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed paragraphs: List[Paragraph], tables: List[Table], figures: List[Figure], + rects: List[Rect], pdfplumber: PDF, pdfminer: PDFDocument, ): @@ -44,9 +46,7 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed self.root = root # attributes for API convenience - self.flattened = Flattened( - chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures - ) + self.flattened = Flattened(chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures, 
rects=rects) # exposing the pdfplumber PDF object self.pdfplumber = pdfplumber @@ -71,3 +71,4 @@ class Flattened(NamedTuple): paragraphs: List[Paragraph] tables: List[Table] figures: List[Figure] + rects: List[Rect] diff --git a/libpdf/core.py b/libpdf/core.py index d21da08..034bba1 100644 --- a/libpdf/core.py +++ b/libpdf/core.py @@ -35,6 +35,8 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable no_paragraphs: bool = False, no_tables: bool = False, no_figures: bool = False, + no_rects: bool = False, + crop_rects_text: bool = False, cli_usage: bool = False, visual_debug: bool = False, visual_debug_output_dir: str = None, @@ -66,6 +68,8 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content) :param no_tables: flag triggering the exclusion of tables :param no_figures: flag triggering the exclusion of figures + :param no_rects: flag triggering the exclusion of rects + :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs :param cli_usage: flag indicating that the function was called through CLI :param visual_debug: flag triggering visual debug feature :param visual_debug_output_dir: output directory for visualized pdf pages @@ -106,20 +110,17 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable if output_path: LOG.info("Output path: %s", output_path) else: - LOG.info("Writing extracted data to stdout") - LOG.info( - "Page range: [%s]", "all" if not pages else ",".join(str(x) for x in pages) - ) - LOG.info( - "Page crop: %s", - "not cropped" if not page_crop else " ".join(str(x) for x in page_crop), - ) - LOG.info("Smart page crop: %s", "on" if smart_page_crop else "off") - LOG.info("Extract annotations: %s", "no" if no_annotations else "yes") - LOG.info("Extract chapters: %s", "no" if no_chapters else "yes") - LOG.info("Extract paragraphs: %s", "no" 
if no_paragraphs else "yes") - LOG.info("Extract tables: %s", "no" if no_tables else "yes") - LOG.info("Extract figures: %s", "no" if no_figures else "yes") + LOG.info('Writing extracted data to stdout') + LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages)) + LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop)) + LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off') + LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes') + LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes') + LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes') + LOG.info('Extract tables: %s', 'no' if no_tables else 'yes') + LOG.info('Extract figures: %s', 'no' if no_figures else 'yes') + LOG.info('Extract rects: %s', 'no' if no_rects else 'yes') + LOG.info('Text rects crop: %s', 'no' if crop_rects_text else 'no') overall_pbar.update(1) try: objects = extract( @@ -133,6 +134,8 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable no_paragraphs, no_tables, no_figures, + no_rects, + crop_rects_text, overall_pbar, ) except LibpdfException: @@ -177,6 +180,8 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals no_paragraphs: bool = False, no_tables: bool = False, no_figures: bool = False, + no_rects: bool = False, + crop_rects_text: bool = False, init_logging: bool = True, visual_debug: bool = False, visual_debug_output_dir: str = "visual_debug_libpdf", @@ -202,6 +207,8 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content) :param no_tables: flag triggering the exclusion of tables :param no_figures: flag triggering the exclusion of figures + :param no_rects: flag triggering the exclusion of rects + :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs :param 
init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of handling both log messages and progress bars; it does so by passing all log messages to tqdm.write() @@ -245,6 +252,8 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals no_paragraphs=no_paragraphs, no_tables=no_tables, no_figures=no_figures, + no_rects = no_rects, + crop_rects_text = crop_rects_text, cli_usage=False, visual_debug=visual_debug, visual_debug_output_dir=visual_debug_output_dir, diff --git a/libpdf/extract.py b/libpdf/extract.py index 833c6a1..681d6b5 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -17,6 +17,7 @@ from libpdf.exceptions import LibpdfException from libpdf.log import logging_needed from libpdf.models.figure import Figure +from libpdf.models.rect import Rect from libpdf.models.file import File from libpdf.models.file_meta import FileMeta from libpdf.models.page import Page @@ -61,6 +62,8 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta no_paragraphs: bool, no_tables: bool, no_figures: bool, + no_rects: bool, + crop_rects_text: bool, overall_pbar: tqdm, ) -> ApiObjects: """ @@ -76,6 +79,8 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta :param no_paragraphs: flag triggering the exclusion of paragraphs (no normal text content) :param no_tables: flag triggering the exclusion of tables :param no_figures: flag triggering the exclusion of figures + :param no_rects: flag triggering the exclusion of rects + :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs :param overall_pbar: total progress bar for whole libpdf run :return: instance of Objects class :raise LibpdfException: PDF contains no pages @@ -142,7 +147,18 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta # smartly remove figures that are in header and footer if smart_page_crop: figure_list = 
smart_page_crop_header_footer(pdf, figure_list) - overall_pbar.update(30) + overall_pbar.update(15) + + if no_rects: + LOG.info('Excluding rects extraction') + rect_list = [] + else: + rect_list = extract_rects(pdf, pages_list, figure_dir) + # smartly remove figures that are in header and footer + if smart_page_crop: + rect_list = smart_page_crop_header_footer(pdf, rect_list) + overall_pbar.update(15) + if no_tables: LOG.info("Excluding tables extraction") @@ -159,6 +175,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta pdf, figure_list, table_list, + rect_list if crop_rects_text else [], pages_list, no_chapters, no_paragraphs, @@ -168,9 +185,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta if smart_page_crop: paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list) - element_list = pro.merge_all_elements( - figure_list, table_list, paragraph_list, chapter_list - ) + element_list = pro.merge_all_elements(figure_list, table_list, paragraph_list, chapter_list, rect_list) # to check if elements shall be mapped into nested outline structure. 
if catalog["outline"] is not None and not no_chapters: @@ -195,6 +210,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta paragraphs=paragraph_list, tables=table_list, figures=figure_list, + rects=rect_list, pdfplumber=pdf, pdfminer=pdf.doc, ) @@ -639,6 +655,85 @@ def extract_figures( return figure_list +def extract_rects( + pdf, + pages_list, + figure_dir, +) -> List[ + Rect +]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up + """Extract rects in PDF.""" + LOG.info('Extracting rects ...') + rect_list = [] + + for idx_page, page in enumerate( # pylint: disable=too-many-nested-blocks + tqdm(pdf.pages, desc='###### Extracting rects', unit='pages', bar_format=bar_format_lvl2()), + ): + if logging_needed(idx_page, len(pdf.pages)): + LOG.debug('Extracting rects page %s of %s', idx_page + 1, len(pdf.pages)) + page_crop = pro.remove_page_header_footer(page) + lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage + + # check and filter figures + #figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else [] + #rects = page_crop.objects['rects'] if 'rects' in page_crop.objects else [] + rects = page.objects['rect'] if 'rect' in page.objects else [] + + + if len(rects) != 0: + for idx_rect, rect in enumerate(rects): + rect_pos = Position( + float(rect['x0']), + float(rect['y0']), + float(rect['x1']), + float(rect['y1']), + pages_list[idx_page], + ) + + non_stroking_color = rect['non_stroking_color'] + fill = rect['fill'] + + bbox = (rect_pos.x0, rect_pos.y0, rect_pos.x1, rect_pos.y1) + + LOG.info(f"found rect at {bbox} at page {idx_page+1}: color {non_stroking_color}"); + + lt_textboxes = lt_page_crop( + bbox, + lt_page._objs, # pylint: disable=protected-access # access needed + LTText, + contain_completely=True, + ) + + textboxes = [] + links = [] + for lt_textbox in lt_textboxes: + if 
catalog['annos']: + links.extend(extract_linked_chars(lt_textbox, lt_page.pageid)) + bbox = (lt_textbox.x0, lt_textbox.y0, lt_textbox.x1, lt_textbox.y1) + + hbox = lt_to_libpdf_hbox_converter(lt_textbox) + + textboxes.append(hbox) + + rect_name = f'page_{page.page_number}_rect.{idx_rect + 1}.png' + + # create figures directory if not exist + Path(figure_dir).mkdir(parents=True, exist_ok=True) + + rect_path = os.path.abspath(os.path.join(figure_dir, rect_name)) + + #figure = Figure(idx_figure + 1, image_path, fig_pos, links, textboxes, 'None') + #figure_list.append(figure) + rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color ) + rect_list.append(rect) + + else: + LOG.info(f"found no rects on page {idx_page+1}: {page_crop.objects.keys()}") + + + #return figure_list + return rect_list + def images_to_save(pdf, figure_list): """Save images to given path.""" @@ -708,6 +803,11 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches if figure["height"] > FIGURE_MIN_HEIGHT and figure["width"] > FIGURE_MIN_WIDTH: filtered_figures.append(figure) + if len(filtered_figures) < len(figures_list): + LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} due to invalid height/width") + + + for figure in filtered_figures: # if figure exceed the boundary of the page, then only keep the part of figure that inside this page if not ( @@ -734,6 +834,7 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches and fig0["y1"] >= fig1["y1"] ): if fig1 in filtered_figures: + LOG.debug("remove filtered figure due to contained in other figure") filtered_figures.remove(fig1) # check if figures partially overlap @@ -754,9 +855,14 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches # compare the size of two figures, keep the bigger figure if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]: if fig0 in 
filtered_figures: + LOG.debug("remove filtered figure fig0 due to partially overlap") filtered_figures.remove(fig0) else: if fig1 in filtered_figures: + LOG.debug("remove filtered figure fig1 due to partially overlap") filtered_figures.remove(fig1) + if len(filtered_figures) < len(figures_list): + LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures") + return filtered_figures diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py new file mode 100644 index 0000000..1c13cb7 --- /dev/null +++ b/libpdf/models/rect.py @@ -0,0 +1,57 @@ +"""Definition for PDF figures.""" +from typing import TYPE_CHECKING, List + +from libpdf.models.element import Element +from libpdf.models.horizontal_box import HorizontalBox +from libpdf.models.link import Link + +# avoid import cycles for back reference type hinting +# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles +if TYPE_CHECKING: + # F401 imported but unused - it's needed for type hinting + from libpdf.models.position import Position # noqa: F401, pylint: disable=ungrouped-imports + + +class Rect(Element): + """ + PDF Rect. + """ + + def __init__( + self, + idx: int, + position: 'Position', + links: List[Link], + textboxes: List[HorizontalBox], + non_stroking_color: str = None, + ): + """Initialize the instance.""" + super().__init__(position=position) + self.idx = idx + self.textboxes = textboxes + self.links = links + self.non_stroking_color = non_stroking_color + if self.links: + self.set_links_backref() + + @property + def id_(self): + """ + Return the identifier to address the Figure. + + The identifier follows the pattern ``figure.``. + idx the 1-based number of the Figure in the current scope (root, chapter, sub-chapters, page). + + It is used as a link target if a PDF link-annotation points to the Element. 
+ + According to PDF model the parameter should be called ``id`` but the name is reserved in Python, so ``id_`` + is used. + + :type: str + """ + return f'rect.{self.idx}' + + def set_links_backref(self): + """Set b_source back reference on all links.""" + for link in self.links: + link.b_source = self diff --git a/libpdf/parameters.py b/libpdf/parameters.py index 2cbc2fa..70d0640 100644 --- a/libpdf/parameters.py +++ b/libpdf/parameters.py @@ -188,26 +188,29 @@ # The following parameters are used for visual debugging # Give colors a human readable name COLORS = { - "red": (255, 0, 0), - "green": (0, 255, 0), - "blue": (0, 0, 255), - "yellow": (255, 255, 0), + 'red': (255, 0, 0), + 'green': (0, 255, 0), + 'blue': (0, 0, 255), + 'yellow': (255, 255, 0), + 'cyan': (255,255,1) } # Map extracted elements with color # the numbers at the end means transparency, the value should be set in range (40, 160) VIS_DBG_MAP_ELEMENTS_COLOR = { - "chapter": COLORS["green"] + (80,), - "paragraph": COLORS["blue"] + (40,), - "table": COLORS["red"] + (40,), - "figure": COLORS["yellow"] + (80,), + 'chapter': COLORS['green'] + (80,), + 'paragraph': COLORS['blue'] + (40,), + 'table': COLORS['red'] + (40,), + 'figure': COLORS['yellow'] + (80,), + 'rect': COLORS['cyan'] + (40,), } RENDER_ELEMENTS = [ - "chapter", - "paragraph", - "table", - "figure", + 'chapter', + 'paragraph', + 'table', + 'figure', + 'rect' ] # the elements that shall be rendered # pdfminer layout analysis parameter from from pdfminer.layout -> LAParams.__init__ diff --git a/libpdf/process.py b/libpdf/process.py index b0f1cc7..6c926e5 100644 --- a/libpdf/process.py +++ b/libpdf/process.py @@ -24,6 +24,7 @@ from libpdf.models.chapter import Chapter from libpdf.models.element import Element from libpdf.models.figure import Figure +from libpdf.models.rect import Rect from libpdf.models.link import Link from libpdf.models.model_base import ModelBase from libpdf.models.page import Page @@ -66,7 +67,7 @@ def to_dict_output(obj: 
Union[ModelBase, Position]) -> Dict: # pylint: disable= """Turn all objects attributes into a dictionary.""" vars_dict = vars(obj).copy() - if isinstance(obj, (Chapter, Figure, Page, Paragraph, Table)): + if isinstance(obj, (Chapter, Figure, Page, Paragraph, Table, Rect)): # insert id as first key into vars_dict # After python3.6/3.7, a dict is sorted in insertion order # https://docs.python.org/3.6/whatsnew/3.6.html#whatsnew36-compactdict @@ -74,7 +75,7 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable= temp_dict = {"id": obj.id_} temp_dict.update(vars_dict) vars_dict = temp_dict - if isinstance(obj, (Figure, Paragraph, Table)): + if isinstance(obj, (Figure, Paragraph, Table, Rect)): # idx is not part of the UML model and should not be exported del vars_dict["idx"] if isinstance(obj, Page): @@ -84,9 +85,9 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable= # textboxes with positions are not interest of the output file if obj.textbox: text = obj.textbox.text - vars_dict["text"] = text - del vars_dict["textbox"] - if isinstance(obj, Figure): + vars_dict['text'] = text + del vars_dict['textbox'] + if isinstance(obj, (Figure, Rect)): # textboxes with positions are not interest of the output file if obj.textboxes: text = "\n".join(x.text for x in obj.textboxes) @@ -219,13 +220,13 @@ def filter_out_outline_page(outline_dict): def map_elements_outline( - element_list: List[Union[Chapter, Figure, Table, Paragraph]], + element_list: List[Union[Chapter, Figure, Table, Paragraph, Rect]], outline_dict, -) -> List[Union[Chapter, Figure, Table, Paragraph]]: +) -> List[Union[Chapter, Figure, Table, Paragraph, Rect]]: """ Map elements into a nested outline structure. - :param element_list: a list of elements including chapters, figures, tables, and paragraphs in a flatten structure. + :param element_list: a list of elements including chapters, figures, rects, tables, and paragraphs in a flatten structure. 
:param outline_dict: a nested outline structure from catalogs. :return: """ @@ -272,7 +273,7 @@ def map_elements_outline( del elements_in_outline[:idx] break - # acquire a list of chapters where their contents are filled with the corresponding elements, figures, tables + # acquire a list of chapters where their contents are filled with the corresponding elements, figures, rects, tables # and paragraphs. This chapter list is still in a flatten structure chapters_content_filled = fill_elements_content(elements_in_outline) @@ -292,11 +293,9 @@ def map_elements_outline( return nested_elements -def fill_elements_content( - elements_in_outline: List[Union[Chapter, Figure, Table, Paragraph]], -) -> List[Chapter]: +def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Rect, Table, Paragraph]]) -> List[Chapter]: """ - Fill the elements, tables, figures and paragraphs into their corresponding chapters' contents. + Fill the elements, tables, figures, rects and paragraphs into their corresponding chapters' contents. The back chapter's reference of tables, figures, and paragraphs are added in this function @@ -305,7 +304,7 @@ def fill_elements_content( """ for index_element, element in enumerate(elements_in_outline): if isinstance(element, Chapter): - id_dict = {"table": 1, "figure": 1, "paragraph": 1} + id_dict = {'table': 1, 'figure': 1, 'paragraph': 1, 'rect': 1} content = elements_in_outline[index_element].content index_b_chapter = index_element else: @@ -452,7 +451,7 @@ def libpdf_target_explorer( # pylint: disable=too-many-nested-blocks # local al def elements_with_anno_finder( elements_on_page: List[Union[Paragraph, Table]], -) -> Union[List[Union[Chapter, Paragraph, Figure, Table, Cell]], None]: +) -> Union[List[Union[Chapter, Paragraph, Figure, Rect, Table, Cell]], None]: """ Find the elements, tables or paragraphs containing source links. 
@@ -536,7 +535,7 @@ def find_target_id(link: Link, pages_list: List[Page], src_element: Element) -> return target_id -def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure]]: +def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure, Rect]]: """ Collect the elements, which occurs on a certain target page. @@ -552,7 +551,7 @@ def get_elements_page(target_page: Page) -> List[Union[Paragraph, Table, Figure] return elements_target_page -def nest_explorer(element: Union[Figure, Table, Chapter, Paragraph]) -> str: +def nest_explorer(element: Union[Figure, Rect, Table, Chapter, Paragraph]) -> str: """ Explore the nested target ID path recursively. diff --git a/libpdf/textbox.py b/libpdf/textbox.py index e4db528..c352a42 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -46,6 +46,7 @@ from libpdf.log import logging_needed from libpdf.models.chapter import Chapter from libpdf.models.figure import Figure +from libpdf.models.rect import Rect from libpdf.models.link import Link from libpdf.models.page import Page from libpdf.models.paragraph import Paragraph @@ -63,11 +64,11 @@ LOG = logging.getLogger(__name__) - def extract_paragraphs_chapters( pdf, figure_list: List[Figure], table_list: List[Table], + rect_list: List[Rect], page_list: List[Page], no_chapters, no_paragraphs, @@ -93,7 +94,7 @@ def extract_paragraphs_chapters( return paragraph_list, chapter_list -def extract_lt_textboxes(pdf, figure_list, table_list, page_list): +def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list): """ Extract and filter lt_textboxes using pdfminer. 
@@ -103,6 +104,7 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list): :param pdf: :param figure_list: :param table_list: + :param rect_list: :param page_list: :return: """ @@ -862,10 +864,11 @@ def _flatten_outline(nested_outline, flatten_outline: List): _flatten_outline(chapter["content"], flatten_outline) -def remove_lt_textboxes_in_tables_figures( +def remove_lt_textboxes_in_tables_figures_rect( page_lt_textboxes: Dict[int, List[LTTextBox]], figure_list: List[Figure], table_list: List[Table], + rect_list: List[Rect] ): """ Remove lt_textboxes in the coverage of tables or figures from page_lt_textboxes. @@ -876,6 +879,7 @@ def remove_lt_textboxes_in_tables_figures( :param page_lt_textboxes: :param figure_list: :param table_list: + :param rect_list: :return: """ page_lt_textboxes_filter = {} @@ -904,9 +908,10 @@ def remove_lt_textboxes_in_tables_figures( return page_lt_textboxes_filter -def tables_figures_merge( +def tables_figures_rect_merge( figure_list: List[Figure], table_list: List[Table], + rect_list: List[Rect], page_index: int, ) -> List[Union[Figure, Table]]: """ @@ -917,6 +922,7 @@ def tables_figures_merge( :param figure_list: A list of all figures extracted from the pages in this pdf :param table_list: A list of all tables extracted from the pages in this pdf + :param rect_list: :param page_index: index of current page number :return: """ diff --git a/libpdf/utils.py b/libpdf/utils.py index f39fa59..d9f1ba1 100644 --- a/libpdf/utils.py +++ b/libpdf/utils.py @@ -34,6 +34,7 @@ from libpdf.models.chapter import Chapter from libpdf.models.element import Element from libpdf.models.figure import Figure +from libpdf.models.rect import Rect from libpdf.models.horizontal_box import Char, HorizontalBox, HorizontalLine, Word from libpdf.models.paragraph import Paragraph from libpdf.models.table import Table @@ -41,19 +42,20 @@ from libpdf.progress import bar_format_lvl1, tqdm MAP_TYPES = { - Chapter: "chapter", - Paragraph: "paragraph", - Table: 
"table", - Figure: "figure", - LTChar: "paragraph", - LTCurve: "figure", - LTTextBox: "paragraph", - LTTextBoxHorizontal: "paragraph", - LTTextLineHorizontal: "paragraph", - LTFigure: "figure", - LTLine: "figure", - LTRect: "figure", - LTImage: "figure", + Chapter: 'chapter', + Paragraph: 'paragraph', + Table: 'table', + Figure: 'figure', + Rect: 'rect', + LTChar: 'paragraph', + LTCurve: 'figure', + LTTextBox: 'paragraph', + LTTextBoxHorizontal: 'paragraph', + LTTextLineHorizontal: 'paragraph', + LTFigure: 'figure', + LTLine: 'figure', + LTRect: 'rect', + LTImage: 'figure', } LOG = logging.getLogger(__name__) @@ -639,10 +641,7 @@ def visual_debug_libpdf( # pylint: disable=too-many-branches LOG.info("Starting visual debug...") # collect all elements all_elements = ( - objects.flattened.chapters - + objects.flattened.paragraphs - + objects.flattened.tables - + objects.flattened.figures + objects.flattened.chapters + objects.flattened.paragraphs + objects.flattened.tables + objects.flattened.figures + objects.flattened.rects ) # prepare for calling the common draw and output function @@ -727,7 +726,7 @@ def render_pages( 3: {...} } - :param render_elements: list of elements to render, options are chapter, paragraph, table, figure + :param render_elements: list of elements to render, options are chapter, paragraph, table, figure, rect :return: None """ render_elements_joined = ", ".join(render_elements) From 138110877393bb39d07b8bb970337bf8354732ff Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Sat, 20 Jan 2024 14:26:06 +0100 Subject: [PATCH 02/18] adapted code to include textboxes of rects --- libpdf/textbox.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/libpdf/textbox.py b/libpdf/textbox.py index c352a42..ce4085f 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -94,7 +94,7 @@ def extract_paragraphs_chapters( return paragraph_list, chapter_list -def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, 
page_list): +def extract_lt_textboxes(pdf, figure_list, table_list, page_list): """ Extract and filter lt_textboxes using pdfminer. @@ -104,7 +104,6 @@ def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list): :param pdf: :param figure_list: :param table_list: - :param rect_list: :param page_list: :return: """ @@ -116,7 +115,8 @@ def extract_lt_textboxes(pdf, figure_list, table_list, rect_list, page_list): del page_lt_textboxes[idx_page] if table_list is not None or figure_list is not None: - page_lt_textboxes_filtered = remove_lt_textboxes_in_tables_figures( + +9 + = remove_lt_textboxes_in_tables_figures( page_lt_textboxes, figure_list, table_list ) else: @@ -864,11 +864,10 @@ def _flatten_outline(nested_outline, flatten_outline: List): _flatten_outline(chapter["content"], flatten_outline) -def remove_lt_textboxes_in_tables_figures_rect( +def remove_lt_textboxes_in_tables_figures( page_lt_textboxes: Dict[int, List[LTTextBox]], figure_list: List[Figure], table_list: List[Table], - rect_list: List[Rect] ): """ Remove lt_textboxes in the coverage of tables or figures from page_lt_textboxes. 
@@ -879,7 +878,6 @@ def remove_lt_textboxes_in_tables_figures_rect( :param page_lt_textboxes: :param figure_list: :param table_list: - :param rect_list: :return: """ page_lt_textboxes_filter = {} @@ -908,10 +906,9 @@ def remove_lt_textboxes_in_tables_figures_rect( return page_lt_textboxes_filter -def tables_figures_rect_merge( +def tables_figures_merge( figure_list: List[Figure], table_list: List[Table], - rect_list: List[Rect], page_index: int, ) -> List[Union[Figure, Table]]: """ @@ -922,7 +919,6 @@ def tables_figures_rect_merge( :param figure_list: A list of all figures extracted from the pages in this pdf :param table_list: A list of all tables extracted from the pages in this pdf - :param rect_list: :param page_index: index of current page number :return: """ From e57322a2efba9ded9576552f50714ffb78498a54 Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Sat, 20 Jan 2024 14:29:38 +0100 Subject: [PATCH 03/18] fixed typo --- libpdf/textbox.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libpdf/textbox.py b/libpdf/textbox.py index ce4085f..f876485 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -115,8 +115,7 @@ def extract_lt_textboxes(pdf, figure_list, table_list, page_list): del page_lt_textboxes[idx_page] if table_list is not None or figure_list is not None: - +9 - = remove_lt_textboxes_in_tables_figures( + page_lt_textboxes_filtered = remove_lt_textboxes_in_tables_figures( page_lt_textboxes, figure_list, table_list ) else: From fc488f7854533ddbdbd8995c33201f45db1985bf Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Sat, 20 Jan 2024 14:46:00 +0100 Subject: [PATCH 04/18] make rect visual obvious --- libpdf/parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libpdf/parameters.py b/libpdf/parameters.py index 70d0640..02d6cab 100644 --- a/libpdf/parameters.py +++ b/libpdf/parameters.py @@ -202,7 +202,7 @@ 'paragraph': COLORS['blue'] + (40,), 'table': COLORS['red'] + (40,), 'figure': COLORS['yellow'] 
+ (80,), - 'rect': COLORS['cyan'] + (40,), + 'rect': COLORS['cyan'] + (160,), } RENDER_ELEMENTS = [ From 8b4447fe524b66bad351681873b568a11df16c7b Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Mon, 22 Jan 2024 10:18:31 +0100 Subject: [PATCH 05/18] improved comments --- libpdf/models/rect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py index 1c13cb7..20090ac 100644 --- a/libpdf/models/rect.py +++ b/libpdf/models/rect.py @@ -1,4 +1,4 @@ -"""Definition for PDF figures.""" +"""Definition for PDF rects.""" from typing import TYPE_CHECKING, List from libpdf.models.element import Element @@ -23,7 +23,7 @@ def __init__( position: 'Position', links: List[Link], textboxes: List[HorizontalBox], - non_stroking_color: str = None, + non_stroking_color: tuple = None, ): """Initialize the instance.""" super().__init__(position=position) From f45b86d54815ccbb8e3cc441cf44f1bfd9433131 Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Mon, 22 Jan 2024 11:23:04 +0100 Subject: [PATCH 06/18] fixed docs --- docs/contents/api.rst | 2 +- docs/contents/pdf_model.puml | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/contents/api.rst b/docs/contents/api.rst index b56afa6..c6b1df1 100644 --- a/docs/contents/api.rst +++ b/docs/contents/api.rst @@ -75,7 +75,7 @@ Figure Rect ~~~~~~ -.. autoclass:: libpdf.models.figure.Rect +.. 
autoclass:: libpdf.models.rect.Rect Position diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml index 2233cfb..819b969 100644 --- a/docs/contents/pdf_model.puml +++ b/docs/contents/pdf_model.puml @@ -176,6 +176,13 @@ **rel_path**: figures/ } + class Rect { + + [tuple] non_stroking_color + -- + **id**: ""rect.<1,2,3,n>"" + **non_stroking_color**: RGB color space + } + package x <> { class HorizontalBox { @@ -217,9 +224,7 @@ -- **text**: the character itself } - - - + } Char "+chars 1..*" ---left* Word Word "+words 1..*" ---left* HorizontalLine HorizontalLine "+lines 1..*" --* HorizontalBox @@ -228,10 +233,12 @@ HorizontalBox "+textbox 0..1" --* Cell HorizontalBox "+textboxes 0..*" --* Figure HorizontalBox "+textbox 0..1" --* Chapter + HorizontalBox "+textboxes 0..*" --* Rect Paragraph "+b_source 1" *-- "+links *" Link Figure "+b_source 1" *-- "+links *" Link Cell "+b_source 1" *-- "+links *" Link + Rect "+b_source 1" *-- "+links *" Link Cell "+cells 1..*" --* "+b_table 1" Table Cell "+b_cell 1" *-down- "+position 1" Position @@ -240,6 +247,7 @@ Figure -[#COL_DERIVE]-|> Element Chapter -[#COL_DERIVE]-|> Element Paragraph -[#COL_DERIVE]|> Element + Rect -[#COL_DERIVE]-|> Element Chapter "1" *-- "+content *" Element: ordered Page "1" *-- "+content *" Element: ordered From 2ab1fd7f4e012dfe77b46ded6e5293bc04ee569f Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Mon, 22 Jan 2024 12:00:01 +0100 Subject: [PATCH 07/18] adapted CLI --- libpdf/core.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libpdf/core.py b/libpdf/core.py index 034bba1..ab97b2b 100644 --- a/libpdf/core.py +++ b/libpdf/core.py @@ -472,6 +472,13 @@ def handle_parse_result(self, ctx, opts, args): help="Skip figures. Figures will not be part of the output JSON/YAML structures and also not saved if" " --save-figures is given.", ) +@click.option( + "--no-rects", + is_flag=True, + show_default=True, + help="Skip rects. 
rects will not be part of the output JSON/YAML structures and also not saved if" + " --save-rects is given.", +) @click.option("-vd", "--visual-debug", is_flag=True, help="Visual debug libpdf.") @click.option( "-vo", From 99d95b988d7bdd9de7e0308abca6ca18f5230873 Mon Sep 17 00:00:00 2001 From: Marco Heinemann Date: Tue, 23 Jan 2024 09:08:52 +0100 Subject: [PATCH 08/18] Fix ruff issues for new rect file --- libpdf/models/rect.py | 42 +++++++++++++++++++++--------------------- pyproject.toml | 7 +++++++ 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py index 20090ac..7d05e7d 100644 --- a/libpdf/models/rect.py +++ b/libpdf/models/rect.py @@ -1,29 +1,28 @@ """Definition for PDF rects.""" -from typing import TYPE_CHECKING, List +from __future__ import annotations + +from typing import TYPE_CHECKING from libpdf.models.element import Element -from libpdf.models.horizontal_box import HorizontalBox -from libpdf.models.link import Link -# avoid import cycles for back reference type hinting -# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: - # F401 imported but unused - it's needed for type hinting - from libpdf.models.position import Position # noqa: F401, pylint: disable=ungrouped-imports + from libpdf.models.horizontal_box import HorizontalBox + from libpdf.models.link import Link + from libpdf.models.position import ( + Position, + ) class Rect(Element): - """ - PDF Rect. 
- """ + """PDF Rect.""" def __init__( self, idx: int, - position: 'Position', - links: List[Link], - textboxes: List[HorizontalBox], - non_stroking_color: tuple = None, + position: Position, + links: list[Link], + textboxes: list[HorizontalBox], + non_stroking_color: tuple | None = None, ): """Initialize the instance.""" super().__init__(position=position) @@ -32,26 +31,27 @@ def __init__( self.links = links self.non_stroking_color = non_stroking_color if self.links: - self.set_links_backref() + self._set_links_backref() @property - def id_(self): + def id_(self) -> str: """ Return the identifier to address the Figure. The identifier follows the pattern ``figure.``. - idx the 1-based number of the Figure in the current scope (root, chapter, sub-chapters, page). + idx the 1-based number of the Figure in the current scope + (root, chapter, sub-chapters, page). It is used as a link target if a PDF link-annotation points to the Element. - According to PDF model the parameter should be called ``id`` but the name is reserved in Python, so ``id_`` - is used. + According to PDF model the parameter should be called ``id`` but the name is + reserved in Python, so ``id_`` is used. :type: str """ - return f'rect.{self.idx}' + return f"rect.{self.idx}" - def set_links_backref(self): + def _set_links_backref(self) -> None: """Set b_source back reference on all links.""" for link in self.links: link.b_source = self diff --git a/pyproject.toml b/pyproject.toml index cfcf7ad..899746d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -139,6 +139,7 @@ ignore = [ "FIX002", # line-contains-todo - TODOs are considered helpful "ISC001", # may conflict with Ruff formatter "TD003", # missing-todo-link - no big processes for us + "ANN101", # missing-type-self - infer typing is simple for this library ] # Allow autofix for all enabled rules (when `--fix`) is provided. 
@@ -148,6 +149,12 @@ unfixable = [ "ERA001", # commented-out-code - commented code is commonly there for a reason ] +[tool.ruff.lint.pylint] +max-args = 8 + +[tool.ruff.lint.flake8-annotations] +mypy-init-return = true # no init return type needed if at least one arg is annotated + [tool.ruff.mccabe] max-complexity = 20 From b06bdcb34f614ef819a2ea9c4a58ae1da8d83c30 Mon Sep 17 00:00:00 2001 From: Marco Heinemann Date: Tue, 23 Jan 2024 09:11:58 +0100 Subject: [PATCH 09/18] Improved docstring --- libpdf/models/rect.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py index 7d05e7d..72f7f8d 100644 --- a/libpdf/models/rect.py +++ b/libpdf/models/rect.py @@ -14,7 +14,13 @@ class Rect(Element): - """PDF Rect.""" + """ + Rectangles in a PDF. + + The rectangles are extracted from pdfplumber. + References to textboxes are stored in case the rectangle touches them. + The textboxes can still be part of paragraphs. + """ def __init__( self, @@ -24,7 +30,6 @@ def __init__( textboxes: list[HorizontalBox], non_stroking_color: tuple | None = None, ): - """Initialize the instance.""" super().__init__(position=position) self.idx = idx self.textboxes = textboxes From 8484832e1a383e667faa6b8ec80aa121961ede5d Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Mon, 22 Jan 2024 14:37:28 +0100 Subject: [PATCH 10/18] rect extraction not saved in figures --- libpdf/extract.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/libpdf/extract.py b/libpdf/extract.py index 681d6b5..07c43e4 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -153,7 +153,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta LOG.info('Excluding rects extraction') rect_list = [] else: - rect_list = extract_rects(pdf, pages_list, figure_dir) + rect_list = extract_rects(pdf, pages_list) # smartly remove figures that are in header and footer if smart_page_crop: rect_list = 
smart_page_crop_header_footer(pdf, rect_list) @@ -658,7 +658,6 @@ def extract_figures( def extract_rects( pdf, pages_list, - figure_dir, ) -> List[ Rect ]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up @@ -715,16 +714,7 @@ def extract_rects( textboxes.append(hbox) - rect_name = f'page_{page.page_number}_rect.{idx_rect + 1}.png' - - # create figures directory if not exist - Path(figure_dir).mkdir(parents=True, exist_ok=True) - - rect_path = os.path.abspath(os.path.join(figure_dir, rect_name)) - - #figure = Figure(idx_figure + 1, image_path, fig_pos, links, textboxes, 'None') - #figure_list.append(figure) - rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color ) + rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color) rect_list.append(rect) else: From 1d2a721634d3f67e771083457904806bd8654100 Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Tue, 23 Jan 2024 10:11:30 +0100 Subject: [PATCH 11/18] adapted model --- docs/contents/pdf_model.puml | 5 ++--- libpdf/extract.py | 34 +++++++++++++++------------------- libpdf/models/rect.py | 20 ++++++-------------- 3 files changed, 23 insertions(+), 36 deletions(-) diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml index 819b969..219350a 100644 --- a/docs/contents/pdf_model.puml +++ b/docs/contents/pdf_model.puml @@ -233,12 +233,11 @@ HorizontalBox "+textbox 0..1" --* Cell HorizontalBox "+textboxes 0..*" --* Figure HorizontalBox "+textbox 0..1" --* Chapter - HorizontalBox "+textboxes 0..*" --* Rect + HorizontalBox "+textbox 0..1" --* Rect Paragraph "+b_source 1" *-- "+links *" Link Figure "+b_source 1" *-- "+links *" Link Cell "+b_source 1" *-- "+links *" Link - Rect "+b_source 1" *-- "+links *" Link Cell "+cells 1..*" --* "+b_table 1" Table Cell "+b_cell 1" *-down- "+position 1" Position @@ -247,7 +246,7 @@ Figure -[#COL_DERIVE]-|> Element Chapter -[#COL_DERIVE]-|> Element Paragraph -[#COL_DERIVE]|> 
Element - Rect -[#COL_DERIVE]-|> Element + Rect -[#COL_DERIVE]|> Element Chapter "1" *-- "+content *" Element: ordered Page "1" *-- "+content *" Element: ordered diff --git a/libpdf/extract.py b/libpdf/extract.py index 07c43e4..c1dd8bb 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -34,7 +34,7 @@ from libpdf.progress import bar_format_lvl2, tqdm from libpdf.tables import extract_pdf_table from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters -from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox +from libpdf.utils import lt_page_crop,lt_textbox_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox LOG = logging.getLogger(__name__) @@ -690,31 +690,27 @@ def extract_rects( ) non_stroking_color = rect['non_stroking_color'] - fill = rect['fill'] - bbox = (rect_pos.x0, rect_pos.y0, rect_pos.x1, rect_pos.y1) - - LOG.info(f"found rect at {bbox} at page {idx_page+1}: color {non_stroking_color}"); - - lt_textboxes = lt_page_crop( - bbox, - lt_page._objs, # pylint: disable=protected-access # access needed - LTText, - contain_completely=True, + offset = 5 + rect_bbox = ( + rect_pos.x0 - offset, + rect_pos.y0 - offset, + rect_pos.x1 + offset, + rect_pos.y1 + offset, ) - textboxes = [] - links = [] - for lt_textbox in lt_textboxes: - if catalog['annos']: - links.extend(extract_linked_chars(lt_textbox, lt_page.pageid)) - bbox = (lt_textbox.x0, lt_textbox.y0, lt_textbox.x1, lt_textbox.y1) + LOG.info(f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}"); + lt_textbox = lt_textbox_crop( + rect_bbox, lt_page._objs, + word_margin=LA_PARAMS["word_margin"], + y_tolerance=LA_PARAMS["line_overlap"], + ) + if lt_textbox: hbox = lt_to_libpdf_hbox_converter(lt_textbox) - textboxes.append(hbox) - rect = Rect( idx_rect + 1, rect_pos, links, textboxes, non_stroking_color) + rect = Rect( idx_rect + 1, rect_pos, hbox, non_stroking_color) rect_list.append(rect) else: diff --git a/libpdf/models/rect.py 
b/libpdf/models/rect.py index 72f7f8d..700841b 100644 --- a/libpdf/models/rect.py +++ b/libpdf/models/rect.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from libpdf.models.element import Element +from libpdf.models.horizontal_box import HorizontalBox if TYPE_CHECKING: from libpdf.models.horizontal_box import HorizontalBox @@ -25,18 +26,14 @@ class Rect(Element): def __init__( self, idx: int, - position: Position, - links: list[Link], - textboxes: list[HorizontalBox], - non_stroking_color: tuple | None = None, + position: 'Position', + textbox: HorizontalBox, + non_stroking_color: tuple = None, ): super().__init__(position=position) self.idx = idx - self.textboxes = textboxes - self.links = links + self.textbox = textbox self.non_stroking_color = non_stroking_color - if self.links: - self._set_links_backref() @property def id_(self) -> str: @@ -54,9 +51,4 @@ def id_(self) -> str: :type: str """ - return f"rect.{self.idx}" - - def _set_links_backref(self) -> None: - """Set b_source back reference on all links.""" - for link in self.links: - link.b_source = self + return f'rect.{self.idx}' From 5cd2e647922afdee66e56b0cb30d369bc20e1319 Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Tue, 23 Jan 2024 11:45:22 +0100 Subject: [PATCH 12/18] changed rect -remove crop_rects_text flag -text within the rect is extracted --- libpdf/core.py | 7 ------- libpdf/extract.py | 6 ++---- libpdf/models/rect.py | 5 ++--- libpdf/process.py | 4 ++-- libpdf/textbox.py | 1 - 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/libpdf/core.py b/libpdf/core.py index ab97b2b..cd402e7 100644 --- a/libpdf/core.py +++ b/libpdf/core.py @@ -36,7 +36,6 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable no_tables: bool = False, no_figures: bool = False, no_rects: bool = False, - crop_rects_text: bool = False, cli_usage: bool = False, visual_debug: bool = False, visual_debug_output_dir: str = None, @@ -69,7 +68,6 @@ def main( # pylint: 
disable=too-many-arguments,too-many-locals # no reasonable :param no_tables: flag triggering the exclusion of tables :param no_figures: flag triggering the exclusion of figures :param no_rects: flag triggering the exclusion of rects - :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs :param cli_usage: flag indicating that the function was called through CLI :param visual_debug: flag triggering visual debug feature :param visual_debug_output_dir: output directory for visualized pdf pages @@ -120,7 +118,6 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable LOG.info('Extract tables: %s', 'no' if no_tables else 'yes') LOG.info('Extract figures: %s', 'no' if no_figures else 'yes') LOG.info('Extract rects: %s', 'no' if no_rects else 'yes') - LOG.info('Text rects crop: %s', 'no' if crop_rects_text else 'no') overall_pbar.update(1) try: objects = extract( @@ -135,7 +132,6 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable no_tables, no_figures, no_rects, - crop_rects_text, overall_pbar, ) except LibpdfException: @@ -181,7 +177,6 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals no_tables: bool = False, no_figures: bool = False, no_rects: bool = False, - crop_rects_text: bool = False, init_logging: bool = True, visual_debug: bool = False, visual_debug_output_dir: str = "visual_debug_libpdf", @@ -208,7 +203,6 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals :param no_tables: flag triggering the exclusion of tables :param no_figures: flag triggering the exclusion of figures :param no_rects: flag triggering the exclusion of rects - :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs :param init_logging: flag indicating whether libpdf shall instantiate a root log handler that is capable of handling both log messages and progress bars; it does so by passing all log messages to 
tqdm.write() @@ -253,7 +247,6 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals no_tables=no_tables, no_figures=no_figures, no_rects = no_rects, - crop_rects_text = crop_rects_text, cli_usage=False, visual_debug=visual_debug, visual_debug_output_dir=visual_debug_output_dir, diff --git a/libpdf/extract.py b/libpdf/extract.py index c1dd8bb..a9b4a11 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -63,7 +63,6 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta no_tables: bool, no_figures: bool, no_rects: bool, - crop_rects_text: bool, overall_pbar: tqdm, ) -> ApiObjects: """ @@ -80,7 +79,6 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta :param no_tables: flag triggering the exclusion of tables :param no_figures: flag triggering the exclusion of figures :param no_rects: flag triggering the exclusion of rects - :param crop_rects_text: flag triggering that rects text should be cropped from text like paragraphs :param overall_pbar: total progress bar for whole libpdf run :return: instance of Objects class :raise LibpdfException: PDF contains no pages @@ -175,7 +173,6 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta pdf, figure_list, table_list, - rect_list if crop_rects_text else [], pages_list, no_chapters, no_paragraphs, @@ -708,7 +705,8 @@ def extract_rects( ) if lt_textbox: hbox = lt_to_libpdf_hbox_converter(lt_textbox) - + else: + hbox = None rect = Rect( idx_rect + 1, rect_pos, hbox, non_stroking_color) rect_list.append(rect) diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py index 700841b..7796f4d 100644 --- a/libpdf/models/rect.py +++ b/libpdf/models/rect.py @@ -8,7 +8,6 @@ if TYPE_CHECKING: from libpdf.models.horizontal_box import HorizontalBox - from libpdf.models.link import Link from libpdf.models.position import ( Position, ) @@ -19,8 +18,8 @@ class Rect(Element): Rectangles in a PDF. 
The rectangles are extracted from pdfplumber. - References to textboxes are stored in case the rectangle touches them. - The textboxes can still be part of paragraphs. + The text covered in the rectangle is extracted and + stored in an newly instantiated textbox. """ def __init__( diff --git a/libpdf/process.py b/libpdf/process.py index 6c926e5..7cb9072 100644 --- a/libpdf/process.py +++ b/libpdf/process.py @@ -81,13 +81,13 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable= if isinstance(obj, Page): # no serialization for the contents of pages del vars_dict["content"] - if isinstance(obj, (Paragraph, Cell, Chapter)): + if isinstance(obj, (Paragraph, Cell, Chapter, Rect)): # textboxes with positions are not interest of the output file if obj.textbox: text = obj.textbox.text vars_dict['text'] = text del vars_dict['textbox'] - if isinstance(obj, (Figure, Rect)): + if isinstance(obj, (Figure)): # textboxes with positions are not interest of the output file if obj.textboxes: text = "\n".join(x.text for x in obj.textboxes) diff --git a/libpdf/textbox.py b/libpdf/textbox.py index f876485..200c448 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -68,7 +68,6 @@ def extract_paragraphs_chapters( pdf, figure_list: List[Figure], table_list: List[Table], - rect_list: List[Rect], page_list: List[Page], no_chapters, no_paragraphs, From b6b966100ea1a6c54e2d5fa9ac12f0dff4bf709b Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Tue, 23 Jan 2024 11:48:46 +0100 Subject: [PATCH 13/18] reformatted by ruff --- libpdf/apiobjects.py | 10 ++++-- libpdf/core.py | 27 ++++++++------ libpdf/extract.py | 82 ++++++++++++++++++++++++++----------------- libpdf/models/rect.py | 6 ++-- libpdf/parameters.py | 30 ++++++++-------- libpdf/process.py | 12 ++++--- libpdf/textbox.py | 2 +- libpdf/utils.py | 36 ++++++++++--------- 8 files changed, 119 insertions(+), 86 deletions(-) diff --git a/libpdf/apiobjects.py b/libpdf/apiobjects.py index 042d36b..9ca56cf 100644 
--- a/libpdf/apiobjects.py +++ b/libpdf/apiobjects.py @@ -7,8 +7,8 @@ from libpdf.models.chapter import Chapter from libpdf.models.figure import Figure -from libpdf.models.rect import Rect from libpdf.models.paragraph import Paragraph +from libpdf.models.rect import Rect from libpdf.models.root import Root from libpdf.models.table import Table @@ -46,7 +46,13 @@ def __init__( # pylint: disable=too-many-arguments # the parameters are needed self.root = root # attributes for API convenience - self.flattened = Flattened(chapters=chapters, paragraphs=paragraphs, tables=tables, figures=figures, rects=rects) + self.flattened = Flattened( + chapters=chapters, + paragraphs=paragraphs, + tables=tables, + figures=figures, + rects=rects, + ) # exposing the pdfplumber PDF object self.pdfplumber = pdfplumber diff --git a/libpdf/core.py b/libpdf/core.py index cd402e7..fc7bac9 100644 --- a/libpdf/core.py +++ b/libpdf/core.py @@ -108,16 +108,21 @@ def main( # pylint: disable=too-many-arguments,too-many-locals # no reasonable if output_path: LOG.info("Output path: %s", output_path) else: - LOG.info('Writing extracted data to stdout') - LOG.info('Page range: [%s]', 'all' if not pages else ','.join(str(x) for x in pages)) - LOG.info('Page crop: %s', 'not cropped' if not page_crop else ' '.join(str(x) for x in page_crop)) - LOG.info('Smart page crop: %s', 'on' if smart_page_crop else 'off') - LOG.info('Extract annotations: %s', 'no' if no_annotations else 'yes') - LOG.info('Extract chapters: %s', 'no' if no_chapters else 'yes') - LOG.info('Extract paragraphs: %s', 'no' if no_paragraphs else 'yes') - LOG.info('Extract tables: %s', 'no' if no_tables else 'yes') - LOG.info('Extract figures: %s', 'no' if no_figures else 'yes') - LOG.info('Extract rects: %s', 'no' if no_rects else 'yes') + LOG.info("Writing extracted data to stdout") + LOG.info( + "Page range: [%s]", "all" if not pages else ",".join(str(x) for x in pages) + ) + LOG.info( + "Page crop: %s", + "not cropped" if not page_crop 
else " ".join(str(x) for x in page_crop), + ) + LOG.info("Smart page crop: %s", "on" if smart_page_crop else "off") + LOG.info("Extract annotations: %s", "no" if no_annotations else "yes") + LOG.info("Extract chapters: %s", "no" if no_chapters else "yes") + LOG.info("Extract paragraphs: %s", "no" if no_paragraphs else "yes") + LOG.info("Extract tables: %s", "no" if no_tables else "yes") + LOG.info("Extract figures: %s", "no" if no_figures else "yes") + LOG.info("Extract rects: %s", "no" if no_rects else "yes") overall_pbar.update(1) try: objects = extract( @@ -246,7 +251,7 @@ def main_api( # pylint: disable=too-many-arguments, too-many-locals no_paragraphs=no_paragraphs, no_tables=no_tables, no_figures=no_figures, - no_rects = no_rects, + no_rects=no_rects, cli_usage=False, visual_debug=visual_debug, visual_debug_output_dir=visual_debug_output_dir, diff --git a/libpdf/extract.py b/libpdf/extract.py index a9b4a11..311710d 100644 --- a/libpdf/extract.py +++ b/libpdf/extract.py @@ -17,11 +17,11 @@ from libpdf.exceptions import LibpdfException from libpdf.log import logging_needed from libpdf.models.figure import Figure -from libpdf.models.rect import Rect from libpdf.models.file import File from libpdf.models.file_meta import FileMeta from libpdf.models.page import Page from libpdf.models.position import Position +from libpdf.models.rect import Rect from libpdf.models.root import Root from libpdf.parameters import ( FIGURE_MIN_HEIGHT, @@ -34,7 +34,12 @@ from libpdf.progress import bar_format_lvl2, tqdm from libpdf.tables import extract_pdf_table from libpdf.textbox import extract_linked_chars, extract_paragraphs_chapters -from libpdf.utils import lt_page_crop,lt_textbox_crop, lt_to_libpdf_hbox_converter, to_pdfplumber_bbox +from libpdf.utils import ( + lt_page_crop, + lt_textbox_crop, + lt_to_libpdf_hbox_converter, + to_pdfplumber_bbox, +) LOG = logging.getLogger(__name__) @@ -148,7 +153,7 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, 
too-many-sta overall_pbar.update(15) if no_rects: - LOG.info('Excluding rects extraction') + LOG.info("Excluding rects extraction") rect_list = [] else: rect_list = extract_rects(pdf, pages_list) @@ -157,7 +162,6 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta rect_list = smart_page_crop_header_footer(pdf, rect_list) overall_pbar.update(15) - if no_tables: LOG.info("Excluding tables extraction") table_list = [] @@ -182,7 +186,9 @@ def extract( # pylint: disable=too-many-locals, too-many-branches, too-many-sta if smart_page_crop: paragraph_list = smart_page_crop_header_footer(pdf, paragraph_list) - element_list = pro.merge_all_elements(figure_list, table_list, paragraph_list, chapter_list, rect_list) + element_list = pro.merge_all_elements( + figure_list, table_list, paragraph_list, chapter_list, rect_list + ) # to check if elements shall be mapped into nested outline structure. if catalog["outline"] is not None and not no_chapters: @@ -652,41 +658,44 @@ def extract_figures( return figure_list + def extract_rects( pdf, pages_list, -) -> List[ - Rect -]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up +) -> List[Rect]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up """Extract rects in PDF.""" - LOG.info('Extracting rects ...') + LOG.info("Extracting rects ...") rect_list = [] for idx_page, page in enumerate( # pylint: disable=too-many-nested-blocks - tqdm(pdf.pages, desc='###### Extracting rects', unit='pages', bar_format=bar_format_lvl2()), + tqdm( + pdf.pages, + desc="###### Extracting rects", + unit="pages", + bar_format=bar_format_lvl2(), + ), ): if logging_needed(idx_page, len(pdf.pages)): - LOG.debug('Extracting rects page %s of %s', idx_page + 1, len(pdf.pages)) + LOG.debug("Extracting rects page %s of %s", idx_page + 1, len(pdf.pages)) page_crop = pro.remove_page_header_footer(page) lt_page = 
page._layout # pylint: disable=protected-access # easiest way to obtain LTPage # check and filter figures - #figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else [] - #rects = page_crop.objects['rects'] if 'rects' in page_crop.objects else [] - rects = page.objects['rect'] if 'rect' in page.objects else [] - + # figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else [] + # rects = page_crop.objects['rects'] if 'rects' in page_crop.objects else [] + rects = page.objects["rect"] if "rect" in page.objects else [] if len(rects) != 0: for idx_rect, rect in enumerate(rects): rect_pos = Position( - float(rect['x0']), - float(rect['y0']), - float(rect['x1']), - float(rect['y1']), + float(rect["x0"]), + float(rect["y0"]), + float(rect["x1"]), + float(rect["y1"]), pages_list[idx_page], ) - non_stroking_color = rect['non_stroking_color'] + non_stroking_color = rect["non_stroking_color"] offset = 5 rect_bbox = ( @@ -696,26 +705,27 @@ def extract_rects( rect_pos.y1 + offset, ) - LOG.info(f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}"); - + LOG.info( + f"found rect at {rect_bbox} at page {idx_page+1}: color {non_stroking_color}" + ) lt_textbox = lt_textbox_crop( - rect_bbox, lt_page._objs, + rect_bbox, + lt_page._objs, word_margin=LA_PARAMS["word_margin"], y_tolerance=LA_PARAMS["line_overlap"], - ) + ) if lt_textbox: hbox = lt_to_libpdf_hbox_converter(lt_textbox) else: hbox = None - rect = Rect( idx_rect + 1, rect_pos, hbox, non_stroking_color) + rect = Rect(idx_rect + 1, rect_pos, hbox, non_stroking_color) rect_list.append(rect) else: LOG.info(f"found no rects on page {idx_page+1}: {page_crop.objects.keys()}") - - #return figure_list + # return figure_list return rect_list @@ -788,9 +798,9 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches filtered_figures.append(figure) if len(filtered_figures) < len(figures_list): - 
LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} due to invalid height/width") - - + LOG.debug( + f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} due to invalid height/width" + ) for figure in filtered_figures: # if figure exceed the boundary of the page, then only keep the part of figure that inside this page @@ -839,14 +849,20 @@ def check_and_filter_figures(figures_list): # pylint: disable=too-many-branches # compare the size of two figures, keep the bigger figure if fig0["width"] * fig0["height"] <= fig1["width"] * fig1["height"]: if fig0 in filtered_figures: - LOG.debug("remove filtered figure fig0 due to partially overlap") + LOG.debug( + "remove filtered figure fig0 due to partially overlap" + ) filtered_figures.remove(fig0) else: if fig1 in filtered_figures: - LOG.debug("remove filtered figure fig1 due to partially overlap") + LOG.debug( + "remove filtered figure fig1 due to partially overlap" + ) filtered_figures.remove(fig1) if len(filtered_figures) < len(figures_list): - LOG.debug(f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures") + LOG.debug( + f"check_and_filter_figures removed {len(figures_list) - len(filtered_figures)} out of {len(figures_list)} figures" + ) return filtered_figures diff --git a/libpdf/models/rect.py b/libpdf/models/rect.py index 7796f4d..7e898d0 100644 --- a/libpdf/models/rect.py +++ b/libpdf/models/rect.py @@ -18,14 +18,14 @@ class Rect(Element): Rectangles in a PDF. The rectangles are extracted from pdfplumber. - The text covered in the rectangle is extracted and + The text covered in the rectangle is extracted and stored in an newly instantiated textbox. 
""" def __init__( self, idx: int, - position: 'Position', + position: Position, textbox: HorizontalBox, non_stroking_color: tuple = None, ): @@ -50,4 +50,4 @@ def id_(self) -> str: :type: str """ - return f'rect.{self.idx}' + return f"rect.{self.idx}" diff --git a/libpdf/parameters.py b/libpdf/parameters.py index 02d6cab..afd9e0d 100644 --- a/libpdf/parameters.py +++ b/libpdf/parameters.py @@ -188,29 +188,29 @@ # The following parameters are used for visual debugging # Give colors a human readable name COLORS = { - 'red': (255, 0, 0), - 'green': (0, 255, 0), - 'blue': (0, 0, 255), - 'yellow': (255, 255, 0), - 'cyan': (255,255,1) + "red": (255, 0, 0), + "green": (0, 255, 0), + "blue": (0, 0, 255), + "yellow": (255, 255, 0), + "cyan": (255, 255, 1), } # Map extracted elements with color # the numbers at the end means transparency, the value should be set in range (40, 160) VIS_DBG_MAP_ELEMENTS_COLOR = { - 'chapter': COLORS['green'] + (80,), - 'paragraph': COLORS['blue'] + (40,), - 'table': COLORS['red'] + (40,), - 'figure': COLORS['yellow'] + (80,), - 'rect': COLORS['cyan'] + (160,), + "chapter": COLORS["green"] + (80,), + "paragraph": COLORS["blue"] + (40,), + "table": COLORS["red"] + (40,), + "figure": COLORS["yellow"] + (80,), + "rect": COLORS["cyan"] + (160,), } RENDER_ELEMENTS = [ - 'chapter', - 'paragraph', - 'table', - 'figure', - 'rect' + "chapter", + "paragraph", + "table", + "figure", + "rect", ] # the elements that shall be rendered # pdfminer layout analysis parameter from from pdfminer.layout -> LAParams.__init__ diff --git a/libpdf/process.py b/libpdf/process.py index 7cb9072..6500aec 100644 --- a/libpdf/process.py +++ b/libpdf/process.py @@ -24,12 +24,12 @@ from libpdf.models.chapter import Chapter from libpdf.models.element import Element from libpdf.models.figure import Figure -from libpdf.models.rect import Rect from libpdf.models.link import Link from libpdf.models.model_base import ModelBase from libpdf.models.page import Page from 
libpdf.models.paragraph import Paragraph from libpdf.models.position import Position +from libpdf.models.rect import Rect from libpdf.models.table import Cell, Table from libpdf.parameters import HEADLINE_TOLERANCE @@ -85,8 +85,8 @@ def to_dict_output(obj: Union[ModelBase, Position]) -> Dict: # pylint: disable= # textboxes with positions are not interest of the output file if obj.textbox: text = obj.textbox.text - vars_dict['text'] = text - del vars_dict['textbox'] + vars_dict["text"] = text + del vars_dict["textbox"] if isinstance(obj, (Figure)): # textboxes with positions are not interest of the output file if obj.textboxes: @@ -293,7 +293,9 @@ def map_elements_outline( return nested_elements -def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Rect, Table, Paragraph]]) -> List[Chapter]: +def fill_elements_content( + elements_in_outline: List[Union[Chapter, Figure, Rect, Table, Paragraph]], +) -> List[Chapter]: """ Fill the elements, tables, figures, rects and paragraphs into their corresponding chapters' contents. 
@@ -304,7 +306,7 @@ def fill_elements_content(elements_in_outline: List[Union[Chapter, Figure, Rect, """ for index_element, element in enumerate(elements_in_outline): if isinstance(element, Chapter): - id_dict = {'table': 1, 'figure': 1, 'paragraph': 1, 'rect': 1} + id_dict = {"table": 1, "figure": 1, "paragraph": 1, "rect": 1} content = elements_in_outline[index_element].content index_b_chapter = index_element else: diff --git a/libpdf/textbox.py b/libpdf/textbox.py index 200c448..e4db528 100644 --- a/libpdf/textbox.py +++ b/libpdf/textbox.py @@ -46,7 +46,6 @@ from libpdf.log import logging_needed from libpdf.models.chapter import Chapter from libpdf.models.figure import Figure -from libpdf.models.rect import Rect from libpdf.models.link import Link from libpdf.models.page import Page from libpdf.models.paragraph import Paragraph @@ -64,6 +63,7 @@ LOG = logging.getLogger(__name__) + def extract_paragraphs_chapters( pdf, figure_list: List[Figure], diff --git a/libpdf/utils.py b/libpdf/utils.py index d9f1ba1..b593191 100644 --- a/libpdf/utils.py +++ b/libpdf/utils.py @@ -34,28 +34,28 @@ from libpdf.models.chapter import Chapter from libpdf.models.element import Element from libpdf.models.figure import Figure -from libpdf.models.rect import Rect from libpdf.models.horizontal_box import Char, HorizontalBox, HorizontalLine, Word from libpdf.models.paragraph import Paragraph +from libpdf.models.rect import Rect from libpdf.models.table import Table from libpdf.parameters import RENDER_ELEMENTS, VIS_DBG_MAP_ELEMENTS_COLOR from libpdf.progress import bar_format_lvl1, tqdm MAP_TYPES = { - Chapter: 'chapter', - Paragraph: 'paragraph', - Table: 'table', - Figure: 'figure', - Rect: 'rect', - LTChar: 'paragraph', - LTCurve: 'figure', - LTTextBox: 'paragraph', - LTTextBoxHorizontal: 'paragraph', - LTTextLineHorizontal: 'paragraph', - LTFigure: 'figure', - LTLine: 'figure', - LTRect: 'rect', - LTImage: 'figure', + Chapter: "chapter", + Paragraph: "paragraph", + Table: "table", + 
Figure: "figure", + Rect: "rect", + LTChar: "paragraph", + LTCurve: "figure", + LTTextBox: "paragraph", + LTTextBoxHorizontal: "paragraph", + LTTextLineHorizontal: "paragraph", + LTFigure: "figure", + LTLine: "figure", + LTRect: "rect", + LTImage: "figure", } LOG = logging.getLogger(__name__) @@ -641,7 +641,11 @@ def visual_debug_libpdf( # pylint: disable=too-many-branches LOG.info("Starting visual debug...") # collect all elements all_elements = ( - objects.flattened.chapters + objects.flattened.paragraphs + objects.flattened.tables + objects.flattened.figures + objects.flattened.rects + objects.flattened.chapters + + objects.flattened.paragraphs + + objects.flattened.tables + + objects.flattened.figures + + objects.flattened.rects ) # prepare for calling the common draw and output function From 1aa72240b8830e31231905bbb23dc6304f8b143f Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Tue, 23 Jan 2024 11:49:06 +0100 Subject: [PATCH 14/18] versioned --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 899746d..bb8f351 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "libpdf" -version = "0.0.1" +version = "0.1.1" description = "Extract structured data from PDFs." 
authors = [ "Marco Heinemann ", From 64fc19fef2255425926a97164349bc55a977c7c7 Mon Sep 17 00:00:00 2001 From: "jui-wen.chen" Date: Tue, 23 Jan 2024 14:23:55 +0100 Subject: [PATCH 15/18] fix for resolve() --- libpdf/catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libpdf/catalog.py b/libpdf/catalog.py index e78de7f..fedc50b 100644 --- a/libpdf/catalog.py +++ b/libpdf/catalog.py @@ -3,7 +3,7 @@ import re from typing import Any, Dict, List, Union -from pdfminer.pdftypes import PDFObjRef +from pdfminer.pdftypes import PDFObjRef, resolve1 from pdfminer.psparser import PSLiteral from libpdf.log import logging_needed @@ -51,7 +51,7 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches ): name_tree = pdf_catalog["Names"].resolve()["Dests"].resolve() elif isinstance(pdf_catalog["Names"], dict) and "Dests" in pdf_catalog["Names"]: - name_tree = pdf_catalog["Names"]["Dests"].resolve() + name_tree = resolve1(pdf_catalog["Names"]["Dests"]) # check if name tree not empty if name_tree: # map page id to page number From 76813902208d75ff65b093769b55e79321ff9b0d Mon Sep 17 00:00:00 2001 From: Marco Heinemann Date: Tue, 23 Jan 2024 14:42:50 +0100 Subject: [PATCH 16/18] Improve docs for non_stroking_color --- docs/contents/pdf_model.puml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/contents/pdf_model.puml b/docs/contents/pdf_model.puml index 219350a..67de5ae 100644 --- a/docs/contents/pdf_model.puml +++ b/docs/contents/pdf_model.puml @@ -180,7 +180,8 @@ + [tuple] non_stroking_color -- **id**: ""rect.<1,2,3,n>"" - **non_stroking_color**: RGB color space + **non_stroking_color**: RGB color space, + given as floats between 0 and 1 } package x <> { From 7335d53624f7704baf5d3538a0d871b624d0e301 Mon Sep 17 00:00:00 2001 From: Marco Heinemann Date: Tue, 23 Jan 2024 14:45:05 +0100 Subject: [PATCH 17/18] Revert version update --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index bb8f351..899746d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "libpdf" -version = "0.1.1" +version = "0.0.1" description = "Extract structured data from PDFs." authors = [ "Marco Heinemann ", From 946c8200e3eb0d14c40c785bf45f7f4c01dcff89 Mon Sep 17 00:00:00 2001 From: Marco Heinemann Date: Tue, 23 Jan 2024 14:51:30 +0100 Subject: [PATCH 18/18] Updated changelog --- docs/contents/changelog.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/contents/changelog.rst b/docs/contents/changelog.rst index c341097..0d19565 100644 --- a/docs/contents/changelog.rst +++ b/docs/contents/changelog.rst @@ -20,11 +20,15 @@ __ https://github.com/useblocks/libpdf/compare/v0.0.1...HEAD Added ~~~~~ +- Introduced new element ``Rect`` (`PR #30 <https://github.com/useblocks/libpdf/pull/30>`_) +- Introduced Ruff as linter and formatter (PRs `#28 <https://github.com/useblocks/libpdf/pull/28>`_, + `#29 <https://github.com/useblocks/libpdf/pull/29>`_ and `#31 <https://github.com/useblocks/libpdf/pull/31>`_) +- Added support for Python 3.10, 3.11 and 3.12 (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_) + Changed ~~~~~~~ - Updated downstream library dependencies - - Added flag ``--no_annotations`` to exclude annotation extraction from the catalog to speed up extraction (`PR #15 <https://github.com/useblocks/libpdf/pull/15>`_) @@ -33,6 +37,8 @@ Removed - libpdf has dropped support for Python 3.6, which reached end-of-life on 2021-12-23; this also fixes the Pillow 8 security vulnerabilities +- libpdf has dropped support for Python 3.7, which reached end-of-life on 2023-06-27 + (`PR #27 <https://github.com/useblocks/libpdf/pull/27>`_) Fixed ~~~~~