-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: create pdf parser with unstructured #8
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import json | ||
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter | ||
import re | ||
import pathlib | ||
|
||
# NOTE(review): absolute, machine-specific path. Despite the name, this points
# at a markdown file (text.md), not a PDF.
pdf_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\text.md"

# Read the whole markdown document into memory.
with open(pdf_path, "r", encoding="utf-8") as md_file:
    md_doc = md_file.read()

# (markdown header prefix, metadata key) pairs passed to
# MarkdownHeaderTextSplitter in md_split().
headers_to_split_on = [
    ("####", "Header 1"),  # Chapter section
    ("######", "Header 2"),  # Acknowledgements, table of contents, etc.
    ("#", "Header 3"),  # Bold
    ("#####", "Header 4"),  # Bold + italic
]
|
||
def preprocess_md(md_doc: str) -> str:
    """Rewrite stand-alone bold lines as markdown headers.

    A line consisting solely of ``**_text_**`` (bold + italic) becomes
    ``##### text``; a line consisting solely of ``**text**`` (bold) becomes
    ``# text``. Bold text embedded in a longer line is left untouched.

    Args:
        md_doc: The markdown document as a single string.

    Returns:
        The document with the matching lines replaced by header lines.
    """
    # Bold + italic has to be handled first: the plain-bold pattern below also
    # matches ``**_text_**`` and would otherwise turn it into ``# _text_``,
    # leaving nothing for the bold + italic rule to match.
    md_doc = re.sub(
        r'^\*\*_([^*]+)_\*\*$', r'##### \1', md_doc, flags=re.MULTILINE
    )
    # Line anchors (rather than consuming the surrounding newlines) let
    # consecutive bold lines and the first/last line of the document match.
    md_doc = re.sub(r'^\*\*(.*?)\*\*$', r'# \1', md_doc, flags=re.MULTILINE)

    return md_doc
|
||
def md_split(md_doc: str) -> list:
    """Split a markdown document into sections at its headers.

    Args:
        md_doc: The markdown document as a single string.

    Returns:
        The list produced by ``MarkdownHeaderTextSplitter.split_text``. The
        calling script reads ``page_content`` and ``metadata`` from each item.
    """
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False,  # keep the header line in each section's content
    )
    return markdown_splitter.split_text(md_doc)
|
||
def recursive_split(
    md_header_splits: list, chunk_size: int = 250, chunk_overlap: int = 30
) -> list:
    """Split header-level sections further into smaller chunks.

    Args:
        md_header_splits: Documents to split, e.g. the output of ``md_split``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The list returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(md_header_splits)
|
||
# Change bold & bold + italic text to headers
md_doc = preprocess_md(md_doc)

# Split data
md_header_splits = md_split(md_doc)
# character_splits = recursive_split(md_header_splits)

# Pair each section's text with the header metadata attached by the splitter
splits_data = [
    {"content": split.page_content, "metadata": split.metadata}
    for split in md_header_splits
]

# Save JSON output
# NOTE(review): absolute, machine-specific path.
output_dir = pathlib.Path(r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed")
output_dir.mkdir(exist_ok=True)

output_json_path = output_dir / "text.json"
with open(output_json_path, "w", encoding="utf-8") as json_file:
    json.dump(splits_data, json_file, ensure_ascii=False, indent=4)

print(f"Markdown splits saved to {output_json_path}")
Comment on lines
+50
to
+72
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this inside a |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import fitz | ||
import pathlib | ||
import pymupdf4llm | ||
|
||
# If output directories do not exist, create them
# NOTE(review): absolute, machine-specific path.
output_dir = pathlib.Path(r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed")
images_dir = output_dir / "images"
# parents=True also creates missing intermediate directories, so the script
# does not fail when the parent of "parsed" has not been created yet.
output_dir.mkdir(parents=True, exist_ok=True)
images_dir.mkdir(exist_ok=True)
|
||
def text_only_extraction(pdf_path: str, output_md: pathlib.Path) -> None:
    """Extract only the text of a PDF as markdown, ignoring images.

    Args:
        pdf_path: Path of the PDF to convert.
        output_md: Path of the markdown file to write.
    """
    md_text = pymupdf4llm.to_markdown(pdf_path)
    # Write to the requested destination; previously ``output_md`` was ignored
    # and the result always went to a hard-coded "pymu_output/output.md".
    pathlib.Path(output_md).write_bytes(md_text.encode())
|
||
def save_image(image_data: bytes, xref: int, ext: str) -> pathlib.Path:
    """Save one image into the module-level ``images_dir`` directory.

    Args:
        image_data: Raw image bytes to write.
        xref: Image identifier used in the file name (``img_<xref>``).
        ext: File extension to use, without the leading dot.

    Returns:
        Path of the written image file.
    """
    image_file = images_dir / f"img_{xref}.{ext}"
    with open(image_file, "wb") as img_file:
        img_file.write(image_data)
    return image_file
|
||
def extract_images(doc, page_number: int) -> str:
    """Save every image of one page and return markdown that references them.

    Args:
        doc: Opened document exposing ``get_page_images`` and
            ``extract_image`` (the script passes the result of ``fitz.open``).
        page_number: Index of the page, as accepted by ``get_page_images``.

    Returns:
        One markdown image link per saved image, joined by newlines; an empty
        string when the page yields no images.
    """
    images_md = []
    for img in doc.get_page_images(page_number):
        xref = img[0]
        image = doc.extract_image(xref)
        if not image:
            # Nothing could be extracted for this xref; skip it.
            continue
        image_file = save_image(image['image'], xref, image['ext'])
        # Reference the saved file; previously only a bare newline was
        # appended, so the saved images never appeared in the markdown.
        # Forward slashes keep the link valid on Windows paths.
        images_md.append(f"![img_{xref}]({image_file.as_posix()})\n")
    return "\n".join(images_md)
|
||
def text_and_image_extraction(pdf_path: str, output_md: pathlib.Path) -> None:
    """Extract text and images page by page and write one markdown file.

    Each page contributes a ``## Page N`` heading, its markdown text, and the
    image markdown returned by ``extract_images`` when there is any.

    Args:
        pdf_path: Path of the PDF to convert.
        output_md: Path of the markdown file to write.
    """
    # Collect into a local list and open the document from ``pdf_path`` here,
    # instead of relying on module-level ``markdown_content`` / ``doc``: a
    # second call no longer duplicates content, and the pages always come from
    # the PDF that was actually requested.
    markdown_content = []

    with fitz.open(pdf_path) as doc:
        for page_number in range(len(doc)):
            page_text = pymupdf4llm.to_markdown(pdf_path, pages=[page_number])
            page_images = extract_images(doc, page_number)

            markdown_content.append(f"## Page {page_number + 1}\n")
            markdown_content.append(page_text.strip())
            if page_images:
                markdown_content.append(page_images)

    # Save md output
    with open(output_md, "w", encoding="utf-8") as md_file:
        md_file.write("\n\n".join(markdown_content))

    print(f"Markdown with text and images saved to: {output_md}")
|
||
# Initialize md content
markdown_content = []

# Load pdf
# NOTE(review): absolute, machine-specific path.
pdf_path = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.pdf"
doc = fitz.open(pdf_path)
output_md = output_dir / "text.md"

# Extract text and images
text_and_image_extraction(pdf_path, output_md)

# Release the PDF file handle once extraction is finished
doc.close()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import json | ||
import os | ||
from typing import List | ||
from langchain_unstructured import UnstructuredLoader | ||
from langchain_core.documents import Document | ||
from dotenv import load_dotenv | ||
|
||
# Load variables from a .env file before reading the Unstructured settings.
load_dotenv()
unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
unstructured_api_url = os.getenv("UNSTRUCTURED_API_URL")
|
||
def create_documents(
    file_path: str = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\geo_form2.pdf",
    output_path: str = r"C:\Users\ADMIN\Desktop\KTHAIS\twiga-warehouse\data\parsed\geo_form2.json",
) -> List[Document]:
    """Create documents from a PDF file and save them as a JSON file.

    The PDF is partitioned through the Unstructured API, using the module-level
    ``unstructured_api_key`` and ``unstructured_api_url``.

    Args:
        file_path: Path of the PDF to parse. Defaults to the location that was
            previously hard-coded, so existing calls behave the same.
        output_path: Path of the JSON file to write. Defaults to the location
            that was previously hard-coded.

    Returns:
        The documents yielded by the loader.
    """
    loader = UnstructuredLoader(
        file_path=file_path,
        strategy="hi_res",
        unique_element_ids=True,
        partition_via_api=True,
        coordinates=True,
        api_key=unstructured_api_key,
        url=unstructured_api_url,
    )

    docs: List[Document] = list(loader.lazy_load())

    # Serialize before opening the file so a serialization failure does not
    # leave a truncated, empty output file behind.
    docs_dict = [doc.model_dump() for doc in docs]
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(docs_dict, f, ensure_ascii=False, indent=2)

    print("Document generated.")

    return docs
|
||
# Run the PDF -> JSON conversion when this script is executed.
create_documents()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The path should be relative to the project root, so that it works regardless of each developer's own absolute path.