From f74a71e0c5435fd604cfa6450c2fdf52744cf086 Mon Sep 17 00:00:00 2001
From: Louisa H <louisa@example.com>
Date: Wed, 5 Mar 2025 17:37:28 +0100
Subject: [PATCH 1/2] feat: create pdf parser with unstructured

---
 src/pdf_parse.py | 49 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 src/pdf_parse.py

diff --git a/src/pdf_parse.py b/src/pdf_parse.py
new file mode 100644
index 0000000..3f23704
--- /dev/null
+++ b/src/pdf_parse.py
@@ -0,0 +1,49 @@
+import json
+import os
+from typing import List
+
+from dotenv import load_dotenv
+from langchain_core.documents import Document
+from langchain_unstructured import UnstructuredLoader
+
+# Credentials for the hosted Unstructured API are read from .env.
+load_dotenv()
+unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
+unstructured_api_url = os.getenv("UNSTRUCTURED_API_URL")
+
+DEFAULT_PDF_PATH = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.pdf"
+
+
+def create_documents(file_path: str = DEFAULT_PDF_PATH) -> List[Document]:
+    """Parse a PDF with the Unstructured API and dump the elements as JSON.
+
+    Args:
+        file_path (str): Path to the source PDF (defaults to the geo_form2
+            sample in the data directory).
+
+    Returns:
+        List[Document]: The parsed document elements.
+    """
+    loader = UnstructuredLoader(
+        file_path=file_path,
+        strategy="hi_res",  # layout-aware parsing (slower, better tables)
+        unique_element_ids=True,
+        partition_via_api=True,  # requires UNSTRUCTURED_API_KEY/URL in .env
+        coordinates=True,
+        api_key=unstructured_api_key,
+        url=unstructured_api_url,
+    )
+
+    # Materialise the lazily streamed elements so we can both serialise
+    # and return the same list.
+    docs: List[Document] = []
+    for doc in loader.lazy_load():
+        docs.append(doc)
+
+    output_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.md"
+    with open(output_path, "w", encoding="utf-8") as f:
+        docs_dict = [doc.model_dump() for doc in docs]
+        json.dump(docs_dict, f, ensure_ascii=False, indent=2)
+
+    print("Document generated.")
+
+    return docs
+
+
+if __name__ == "__main__":
+    create_documents()

From 552415004bb9f61040ff1bb1d11852dcf43ee13a Mon Sep 17 00:00:00 2001
From: Louisa H <louisa@example.com>
Date: Wed, 19 Mar 2025 19:17:18 +0100
Subject: [PATCH 2/2] Fixes issue #5: Test pymupdf4llm for parsing and
 langchain for chunking

---
 src/langchain_chunking.py | 72 +++++++++++++++++++
 src/pymu_parsing.py | 65
+++++++++++++++++ src/{pdf_parse.py => unstructured_parsing.py} | 2 +- 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 src/langchain_chunking.py create mode 100644 src/pymu_parsing.py rename src/{pdf_parse.py => unstructured_parsing.py} (97%) diff --git a/src/langchain_chunking.py b/src/langchain_chunking.py new file mode 100644 index 0000000..af4ab35 --- /dev/null +++ b/src/langchain_chunking.py @@ -0,0 +1,72 @@ +import json +from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter +import re +import pathlib + +pdf_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\text.md" + +with open(pdf_path, "r", encoding="utf-8") as md_file: + md_doc = md_file.read() + +headers_to_split_on = [ + ("####", "Header 1"), # Chapter section + ("######", "Header 2"), # Acknowledgements, table of contents, etc. + ("#", "Header 3"), # Bold + ("#####", "Header 4"), # Bold + italic +] + +def preprocess_md(md_doc): + """Modify md document so that bold and bold + italic text is recognized as headers.""" + + md_doc = re.sub(r'\n\*\*(.*?)\*\*\n', r'\n# \1\n', md_doc) # Bold + md_doc = re.sub(r'\n\*\*_([^*]+)_\*\*\n', r'\n##### \1\n', md_doc) # Bold + italic + + return md_doc + +def md_split(md_doc): + """Split document based on md headers.""" + + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, strip_headers=False + ) + md_header_splits = markdown_splitter.split_text(md_doc) + + return md_header_splits + +def recursive_split(md_header_splits): + """Split document recursively.""" + + chunk_size = 250 + chunk_overlap = 30 + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + + splits = text_splitter.split_documents(md_header_splits) + + return splits + +# Change bold & bold + italic text to headers +md_doc = preprocess_md(md_doc) + +# Split data +md_header_splits = md_split(md_doc) +#character_splits = 
recursive_split(md_header_splits) + +# Append metadata +splits_data = [] +for split in md_header_splits: + splits_data.append({ + "content": split.page_content, + "metadata": split.metadata + }) + +# Save JSON output +output_dir = pathlib.Path(r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed") +output_dir.mkdir(exist_ok=True) + +output_json_path = output_dir / "text.json" +with open(output_json_path, "w", encoding="utf-8") as json_file: + json.dump(splits_data, json_file, ensure_ascii=False, indent=4) + +print(f"Markdown splits saved to {output_json_path}") \ No newline at end of file diff --git a/src/pymu_parsing.py b/src/pymu_parsing.py new file mode 100644 index 0000000..f15c889 --- /dev/null +++ b/src/pymu_parsing.py @@ -0,0 +1,65 @@ +import fitz +import pathlib +import pymupdf4llm + +# If output directories do not exist, create them +output_dir = pathlib.Path(r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed") +images_dir = output_dir / "images" +output_dir.mkdir(exist_ok=True) +images_dir.mkdir(exist_ok=True) + +def text_only_extraction(pdf_path, output_md): + """Only extract text from pdf, ignore images.""" + + md_text = pymupdf4llm.to_markdown(pdf_path) + pathlib.Path("pymu_output/output.md").write_bytes(md_text.encode()) + +def save_image(image_data, xref, ext): + """Save image to the images directory.""" + + image_file = images_dir / f"img_{xref}.{ext}" + with open(image_file, "wb") as img_file: + img_file.write(image_data) + return image_file + +def extract_images(doc, page_number): + """Extract images from a specific page.""" + + images_md = [] + for img in doc.get_page_images(page_number): + xref = img[0] + image = doc.extract_image(xref) + if not image: + continue + image_file = save_image(image['image'], xref, image['ext']) + images_md.append(f"![Image {xref}]({image_file})\n") + return "\n".join(images_md) + +def text_and_image_extraction(pdf_path, output_md): + """Extract text and image page by page.""" + + for page_number in 
range(len(doc)): + page_text = pymupdf4llm.to_markdown(pdf_path, pages=[page_number]) + page_images = extract_images(doc, page_number) + + markdown_content.append(f"## Page {page_number + 1}\n") + markdown_content.append(page_text.strip()) + if page_images: + markdown_content.append(page_images) + + # Save md output + with open(output_md, "w", encoding="utf-8") as md_file: + md_file.write("\n\n".join(markdown_content)) + + print(f"Markdown with text and images saved to: {output_md}") + +# Initialize md content +markdown_content = [] + +# Load pdf +pdf_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.pdf" +doc = fitz.open(pdf_path) +output_md = output_dir / "text.md" + +# Extract text and images +text_and_image_extraction(pdf_path, output_md) \ No newline at end of file diff --git a/src/pdf_parse.py b/src/unstructured_parsing.py similarity index 97% rename from src/pdf_parse.py rename to src/unstructured_parsing.py index 3f23704..558fc50 100644 --- a/src/pdf_parse.py +++ b/src/unstructured_parsing.py @@ -33,7 +33,7 @@ def create_documents(): for doc in loader.lazy_load(): docs.append(doc) - output_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.md" + output_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\geo_form2.json" with open(output_path, "w", encoding="utf-8") as f: docs_dict = [doc.model_dump() for doc in docs] json.dump(docs_dict, f, ensure_ascii=False, indent=2)