feat: create pdf parser with unstructured #8

Open · wants to merge 2 commits into base: main
72 changes: 72 additions & 0 deletions src/langchain_chunking.py
@@ -0,0 +1,72 @@
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
import re
import pathlib

pdf_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\text.md"
Collaborator: The path should be relative inside the project, since every developer has their own absolute path. (A relative-path variant is sketched below.)
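For illustration, a project-relative variant might look roughly like this (a sketch, not part of the submitted diff; it assumes the script stays in src/ and the parsed file stays in data/parsed/):

# Sketch: resolve the markdown file relative to the repository root
# instead of hard-coding an absolute Windows path.
project_root = pathlib.Path(__file__).resolve().parents[1]
pdf_path = project_root / "data" / "parsed" / "text.md"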


with open(pdf_path, "r", encoding="utf-8") as md_file:
    md_doc = md_file.read()

headers_to_split_on = [
    ("####", "Header 1"), # Chapter section
    ("######", "Header 2"), # Acknowledgements, table of contents, etc.
    ("#", "Header 3"), # Bold
    ("#####", "Header 4"), # Bold + italic
]

def preprocess_md(md_doc):
Collaborator: Can you type the input and the output? Example:

def foo(im_a_number: int) -> int:
    return im_a_number + 1

Collaborator: Same for the other functions. (A typed sketch of preprocess_md follows the function below.)

"""Modify md document so that bold and bold + italic text is recognized as headers."""

md_doc = re.sub(r'\n\*\*(.*?)\*\*\n', r'\n# \1\n', md_doc) # Bold
md_doc = re.sub(r'\n\*\*_([^*]+)_\*\*\n', r'\n##### \1\n', md_doc) # Bold + italic

return md_doc
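For illustration, a typed version of preprocess_md might look like this (a sketch of the reviewer's suggestion, not part of the submitted diff; the behavior is unchanged):

def preprocess_md(md_doc: str) -> str:
    """Modify md document so that bold and bold + italic text is recognized as headers."""
    md_doc = re.sub(r'\n\*\*(.*?)\*\*\n', r'\n# \1\n', md_doc)  # Bold
    md_doc = re.sub(r'\n\*\*_([^*]+)_\*\*\n', r'\n##### \1\n', md_doc)  # Bold + italic
    return md_doc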

def md_split(md_doc):
"""Split document based on md headers."""

markdown_splitter = MarkdownHeaderTextSplitter(
headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(md_doc)

return md_header_splits

def recursive_split(md_header_splits):
"""Split document recursively."""

chunk_size = 250
chunk_overlap = 30
Collaborator (on lines +39 to +40): Maybe it would be interesting to declare these two variables as inputs of the function so that we could play with them if needed; a complete sketch is shown after the function below. Example:

def recursive_split(md_header_splits, chunk_size: int = 250, chunk_overlap: int = 30):
    """Split document recursively."""
    ...

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    splits = text_splitter.split_documents(md_header_splits)

    return splits
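A complete version of that suggestion might look like this (a sketch, not part of the submitted diff; the defaults reuse the values above):

def recursive_split(md_header_splits, chunk_size: int = 250, chunk_overlap: int = 30):
    """Split document recursively, with configurable chunking parameters."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(md_header_splits)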

# Change bold & bold + italic text to headers
md_doc = preprocess_md(md_doc)

# Split data
md_header_splits = md_split(md_doc)
#character_splits = recursive_split(md_header_splits)

# Append metadata
splits_data = []
for split in md_header_splits:
    splits_data.append({
        "content": split.page_content,
        "metadata": split.metadata
    })

# Save JSON output
output_dir = pathlib.Path(r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed")
output_dir.mkdir(exist_ok=True)

output_json_path = output_dir / "text.json"
with open(output_json_path, "w", encoding="utf-8") as json_file:
    json.dump(splits_data, json_file, ensure_ascii=False, indent=4)

print(f"Markdown splits saved to {output_json_path}")
Collaborator (on lines +50 to +72): Can you move this inside a main function? This also implies adding the famous if __name__ == "__main__": line at the bottom of the file. Same for the other files. (A sketch is shown below.)
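For illustration, the reviewer's suggestion might look roughly like this for langchain_chunking.py (a sketch, not part of the submitted diff; the name main is an assumption, and pdf_path / output_dir are the module-level values defined above):

def main():
    with open(pdf_path, "r", encoding="utf-8") as md_file:
        md_doc = md_file.read()

    md_doc = preprocess_md(md_doc)        # bold / bold + italic -> headers
    md_header_splits = md_split(md_doc)   # split on markdown headers

    splits_data = [
        {"content": split.page_content, "metadata": split.metadata}
        for split in md_header_splits
    ]

    output_json_path = output_dir / "text.json"
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(splits_data, json_file, ensure_ascii=False, indent=4)
    print(f"Markdown splits saved to {output_json_path}")

if __name__ == "__main__":
    main()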

65 changes: 65 additions & 0 deletions src/pymu_parsing.py
@@ -0,0 +1,65 @@
import fitz
import pathlib
import pymupdf4llm

# If output directories do not exist, create them
output_dir = pathlib.Path(r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed")
images_dir = output_dir / "images"
output_dir.mkdir(exist_ok=True)
images_dir.mkdir(exist_ok=True)

def text_only_extraction(pdf_path, output_md):
"""Only extract text from pdf, ignore images."""

md_text = pymupdf4llm.to_markdown(pdf_path)
pathlib.Path("pymu_output/output.md").write_bytes(md_text.encode())

def save_image(image_data, xref, ext):
"""Save image to the images directory."""

image_file = images_dir / f"img_{xref}.{ext}"
with open(image_file, "wb") as img_file:
img_file.write(image_data)
return image_file

def extract_images(doc, page_number):
"""Extract images from a specific page."""

images_md = []
for img in doc.get_page_images(page_number):
xref = img[0]
image = doc.extract_image(xref)
if not image:
continue
image_file = save_image(image['image'], xref, image['ext'])
images_md.append(f"![Image {xref}]({image_file})\n")
return "\n".join(images_md)

def text_and_image_extraction(pdf_path, output_md):
"""Extract text and image page by page."""

for page_number in range(len(doc)):
page_text = pymupdf4llm.to_markdown(pdf_path, pages=[page_number])
page_images = extract_images(doc, page_number)

markdown_content.append(f"## Page {page_number + 1}\n")
markdown_content.append(page_text.strip())
if page_images:
markdown_content.append(page_images)

# Save md output
with open(output_md, "w", encoding="utf-8") as md_file:
md_file.write("\n\n".join(markdown_content))

print(f"Markdown with text and images saved to: {output_md}")

# Initialize md content
markdown_content = []

# Load pdf
pdf_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.pdf"
doc = fitz.open(pdf_path)
output_md = output_dir / "text.md"

# Extract text and images
text_and_image_extraction(pdf_path, output_md)
45 changes: 45 additions & 0 deletions src/unstructured_parsing.py
@@ -0,0 +1,45 @@
import json
import os
from typing import List
from langchain_unstructured import UnstructuredLoader
from langchain_core.documents import Document
from dotenv import load_dotenv

load_dotenv()
unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
unstructured_api_url = os.getenv("UNSTRUCTURED_API_URL")

def create_documents():
"""Create documents from a PDF file and save them as JSON files.

Args:
file_name (str): The name of the PDF file in the data/raw directory
"""

file_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.pdf"

loader = UnstructuredLoader(
file_path=file_path,
strategy="hi_res",
unique_element_ids=True,

partition_via_api=True,
coordinates=True,
api_key=unstructured_api_key,
url=unstructured_api_url,
)

docs: List[Document] = []
for doc in loader.lazy_load():
docs.append(doc)

output_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\geo_form2.json"
with open(output_path, "w", encoding="utf-8") as f:
docs_dict = [doc.model_dump() for doc in docs]
json.dump(docs_dict, f, ensure_ascii=False, indent=2)

print("Document generated.")

return docs

create_documents()
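Since the documents are serialized with model_dump, they can presumably be reconstructed from the JSON later; a minimal sketch, assuming the same output path as above and a pydantic v2 Document (implied by the model_dump call):

# Sketch: load the saved documents back into langchain Document objects.
import json
from langchain_core.documents import Document

json_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\geo_form2.json"
with open(json_path, "r", encoding="utf-8") as f:
    docs = [Document.model_validate(d) for d in json.load(f)]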