Improve Text Chunker #168

Merged · 3 commits · Mar 21, 2025

@@ -283,7 +283,6 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:

     def get_semantic_chunker_skill(
         self,
-        num_surrounding_sentences: int = 2,
         similarity_threshold: float = 0.8,
         max_chunk_tokens: int = 500,
         min_chunk_tokens: int = 150,
@@ -294,7 +293,6 @@ def get_semantic_chunker_skill(
         -----
             context (str): The context of the skill
             source (str): The source of the skill
-            num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1.
             similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8.
             max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 200.

@@ -314,8 +312,8 @@ def get_semantic_chunker_skill(
                 name="content", source="/document/layout_merged_content"
             ),
             InputFieldMappingEntry(
-                name="per_page_starting_sentences",
-                source="/document/per_page_starting_sentences",
+                name="page_number_tracking_holders",
+                source="/document/page_number_tracking_holders",
             ),
         ]

@@ -333,7 +331,6 @@ def get_semantic_chunker_skill(
             degree_of_parallelism=degree_of_parallelism,
             http_method="POST",
             http_headers={
-                "num_surrounding_sentences": num_surrounding_sentences,
                 "similarity_threshold": similarity_threshold,
                 "max_chunk_tokens": max_chunk_tokens,
                 "min_chunk_tokens": min_chunk_tokens,
@@ -385,8 +382,8 @@ def get_layout_analysis_skill(
         output = [
             OutputFieldMappingEntry(name="layout", target_name="layout"),
             OutputFieldMappingEntry(
-                name="per_page_starting_sentences",
-                target_name="per_page_starting_sentences",
+                name="page_number_tracking_holders",
+                target_name="page_number_tracking_holders",
             ),
         ]

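For context, a minimal sketch of the skill definition these hunks produce: the HTTP header names and input mappings are taken from the diff above, while the skill name, URI, context, and output mapping are hypothetical placeholders rather than values from this repository.

```python
# Illustrative sketch only: header names and input mappings mirror the diff;
# the skill name, URI, context, and output mapping are hypothetical.
from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    WebApiSkill,
)

semantic_chunker_skill = WebApiSkill(
    name="semantic-chunker",  # hypothetical
    uri="https://my-function-app.azurewebsites.net/api/semantic_text_chunker",  # hypothetical
    context="/document",  # hypothetical
    http_method="POST",
    http_headers={
        "similarity_threshold": 0.8,
        "max_chunk_tokens": 500,
        "min_chunk_tokens": 150,
    },
    inputs=[
        InputFieldMappingEntry(name="content", source="/document/layout_merged_content"),
        InputFieldMappingEntry(
            name="page_number_tracking_holders",
            source="/document/page_number_tracking_holders",
        ),
    ],
    outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")],  # hypothetical
)
```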
image_processing/README.md: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ This skill merges the layout output with the figure outputs to create a unified

 ### Semantic Chunker Custom Skill

-You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
+You can then test the chunking by sending an AI Search JSON payload to the `/semantic_text_chunker/` HTTP endpoint. The headers control the chunking parameters *(similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.

 ### MarkUp Cleaner Custom Skill
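To illustrate that README instruction, a hedged sketch of a local test call follows. The endpoint path comes from the README; the host and port (a local Azure Functions host), the default `/api` route prefix, and the sample record fields are assumptions based on the skill inputs above.

```python
# Hedged sketch of a local test call; host, route prefix, and sample data are
# assumptions, while the header names mirror the chunking parameters above.
import requests

payload = {
    "values": [
        {
            "recordId": "0",
            "data": {
                "content": "First sentence of page one. More text follows...",
                "page_number_tracking_holders": [
                    {"page_number": 1, "page_content": "First sentence of page one."}
                ],
            },
        }
    ]
}

response = requests.post(
    "http://localhost:7071/api/semantic_text_chunker",  # assumed local endpoint
    json=payload,
    headers={  # header values are strings on the wire; the function coerces them
        "similarity_threshold": "0.8",
        "max_chunk_tokens": "500",
        "min_chunk_tokens": "150",
    },
)
print(response.status_code, response.json())
```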
image_processing/src/image_processing/function_app.py: 0 additions & 4 deletions
@@ -171,9 +171,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:

    semantic_text_chunker_config = req.headers

-    num_surrounding_sentences = int(
-        semantic_text_chunker_config.get("num_surrounding_sentences", 1)
-    )
     similarity_threshold = float(
         semantic_text_chunker_config.get("similarity_threshold", 0.8)
     )
@@ -192,7 +189,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
     record_tasks = []

     semantic_text_chunker_processor = SemanticTextChunker(
-        num_surrounding_sentences=num_surrounding_sentences,
         similarity_threshold=similarity_threshold,
         max_chunk_tokens=max_chunk_tokens,
         min_chunk_tokens=min_chunk_tokens,
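Header values arrive as strings in an HTTP request, which is why the handler coerces them with `float()` and `int()`. Below is a self-contained sketch of that pattern with a plain dict standing in for `req.headers`; the token defaults are borrowed from the skill signature above as an assumption, since the function-side defaults are elided in this hunk.

```python
# Sketch of the header-parsing pattern above; a plain dict stands in for
# req.headers. The token defaults are assumed from the skill signature.
semantic_text_chunker_config = {"similarity_threshold": "0.9"}  # e.g. one header set

similarity_threshold = float(semantic_text_chunker_config.get("similarity_threshold", 0.8))
max_chunk_tokens = int(semantic_text_chunker_config.get("max_chunk_tokens", 500))
min_chunk_tokens = int(semantic_text_chunker_config.get("min_chunk_tokens", 150))

print(similarity_threshold, max_chunk_tokens, min_chunk_tokens)  # 0.9 500 150
```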
image_processing/src/image_processing/layout_analysis.py: 32 additions & 15 deletions
@@ -22,8 +22,9 @@
     LayoutHolder,
     PageWiseContentHolder,
     NonPageWiseContentHolder,
-    PerPageStartingSentenceHolder,
+    PageNumberTrackingHolder,
 )
+import re


 class StorageAccountHelper:
@@ -341,14 +342,14 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

         return page_wise_contents

-    def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
+    def create_page_number_tracking_holder(self) -> list[PageNumberTrackingHolder]:
         """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.

         Returns:
         --------
             list: A list of the starting sentence of each page."""

-        per_page_starting_sentences = []
+        page_number_tracking_holders = []

         for page in self.result.pages:
             page_content = self.result.content[
@@ -358,22 +359,38 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde

             # Remove any leading whitespace/newlines.
             cleaned_content = page_content.lstrip()
-            # If a newline appears before a period, split on newline; otherwise, on period.
-            if "\n" in cleaned_content:
-                first_line = cleaned_content.split("\n", 1)[0]
-            elif "." in cleaned_content:
-                first_line = cleaned_content.split(".", 1)[0]
+            # Strip the html comment but keep the content
+            html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
+            cleaned_content = html_comments_pattern.sub("", cleaned_content)
+
+            # Remove anything inside a figure tag
+            cleaned_content = re.sub(
+                "<figure>(.*?)</figure>",
+                "",
+                cleaned_content,
+                flags=re.DOTALL | re.MULTILINE,
+            )
+            logging.info(f"Page Number: {page.page_number}")
+            logging.info(f"Content for Page Detection: {page_content}")
+            logging.info(f"Cleaned Content for Page Detection: {cleaned_content}")
+
+            if len(cleaned_content) == 0:
+                logging.error(
+                    "No content found in the cleaned result for page %s.",
+                    page.page_number,
+                )
+                cleaned_content = None
             else:
-                first_line = cleaned_content
+                cleaned_content = cleaned_content.strip()

-            per_page_starting_sentences.append(
-                PerPageStartingSentenceHolder(
+            page_number_tracking_holders.append(
+                PageNumberTrackingHolder(
                     page_number=page.page_number,
-                    starting_sentence=first_line.strip(),
+                    page_content=cleaned_content,
                 )
             )

-        return per_page_starting_sentences
+        return page_number_tracking_holders

     async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
         """Get the Azure Document Intelligence client.
@@ -522,11 +539,11 @@ async def analyse(self):
                 if self.extract_figures:
                     await self.process_figures_from_extracted_content(text_content)

-                per_page_starting_sentences = self.create_per_page_starting_sentence()
+                page_number_tracking_holders = self.create_page_number_tracking_holder()

                 output_record = NonPageWiseContentHolder(
                     layout=text_content,
-                    per_page_starting_sentences=per_page_starting_sentences,
+                    page_number_tracking_holders=page_number_tracking_holders,
                 )

         except Exception as e:
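To see the new cleaning step in isolation, here is a self-contained sketch of the two regex passes from the hunk above, run on an invented page string; the sample text is made up for illustration.

```python
# Standalone sketch of the cleaning logic above; the sample page text is invented.
import re

page_content = """<!-- PageHeader="Annual Report" -->
<figure>Figure 1: revenue by segment</figure>
Results improved across all segments in FY24."""

cleaned_content = page_content.lstrip()

# Strip the HTML comment but keep the surrounding content.
html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
cleaned_content = html_comments_pattern.sub("", cleaned_content)

# Remove anything inside a figure tag.
cleaned_content = re.sub(
    "<figure>(.*?)</figure>", "", cleaned_content, flags=re.DOTALL | re.MULTILINE
)

# Empty pages yield None; otherwise the text is stripped (as in the diff).
cleaned_content = cleaned_content.strip() if len(cleaned_content) > 0 else None
print(cleaned_content)  # -> Results improved across all segments in FY24.
```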
image_processing/src/image_processing/layout_holders.py: 3 additions & 4 deletions
@@ -47,18 +47,18 @@ class PageWiseContentHolder(BaseModel):
     page_wise_layout: list[LayoutHolder]


-class PerPageStartingSentenceHolder(BaseModel):
+class PageNumberTrackingHolder(BaseModel):
     """A class to hold the starting sentence of each page."""

     page_number: int
-    starting_sentence: str
+    page_content: str | None


 class NonPageWiseContentHolder(BaseModel):
     """A class to hold the non-page-wise content extracted from the document."""

     layout: LayoutHolder
-    per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
+    page_number_tracking_holders: list[PageNumberTrackingHolder] = Field(
         default_factory=list
     )

@@ -69,6 +69,5 @@ class ChunkHolder(BaseModel):
     mark_up: str
     sections: Optional[list[str]] = Field(default_factory=list)
     figures: Optional[list[FigureHolder]] = Field(default_factory=list)
-    starting_sentence: Optional[str] = None
     cleaned_text: Optional[str] = None
     page_number: Optional[int] = Field(default=None)
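Finally, a quick sanity sketch of the renamed model, mirroring the fields shown above; it assumes Pydantic is installed and Python 3.10+ for the `str | None` union.

```python
# Minimal self-contained sketch mirroring PageNumberTrackingHolder as defined above.
from pydantic import BaseModel

class PageNumberTrackingHolder(BaseModel):
    page_number: int
    page_content: str | None

pages = [
    PageNumberTrackingHolder(page_number=1, page_content="Results improved in FY24."),
    PageNumberTrackingHolder(page_number=2, page_content=None),  # no usable text found
]
print([p.page_number for p in pages if p.page_content is None])  # -> [2]
```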