diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
index 8a872f8..4e00d14 100644
--- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
+++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
@@ -283,7 +283,6 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
def get_semantic_chunker_skill(
self,
- num_surrounding_sentences: int = 2,
similarity_threshold: float = 0.8,
max_chunk_tokens: int = 500,
min_chunk_tokens: int = 150,
@@ -294,7 +293,6 @@ def get_semantic_chunker_skill(
-----
context (str): The context of the skill
source (str): The source of the skill
- num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1.
similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8.
max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 200.
@@ -314,8 +312,8 @@ def get_semantic_chunker_skill(
name="content", source="/document/layout_merged_content"
),
InputFieldMappingEntry(
- name="per_page_starting_sentences",
- source="/document/per_page_starting_sentences",
+ name="page_number_tracking_holders",
+ source="/document/page_number_tracking_holders",
),
]
@@ -333,7 +331,6 @@ def get_semantic_chunker_skill(
degree_of_parallelism=degree_of_parallelism,
http_method="POST",
http_headers={
- "num_surrounding_sentences": num_surrounding_sentences,
"similarity_threshold": similarity_threshold,
"max_chunk_tokens": max_chunk_tokens,
"min_chunk_tokens": min_chunk_tokens,
@@ -385,8 +382,8 @@ def get_layout_analysis_skill(
output = [
OutputFieldMappingEntry(name="layout", target_name="layout"),
OutputFieldMappingEntry(
- name="per_page_starting_sentences",
- target_name="per_page_starting_sentences",
+ name="page_number_tracking_holders",
+ target_name="page_number_tracking_holders",
),
]
diff --git a/image_processing/README.md b/image_processing/README.md
index be740e4..10fdc74 100644
--- a/image_processing/README.md
+++ b/image_processing/README.md
@@ -98,7 +98,7 @@ This skill merges the layout output with the figure outputs to create a unified
### Semantic Chunker Custom Skill
-You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
+You can then test the chunking by sending an AI Search JSON request to the `/semantic_text_chunker/` HTTP endpoint. The headers control the different chunking parameters *(similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
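+
+For example, a minimal request sketch (assuming the Function App is running on a local Functions host; the URL, record values and header values below are illustrative) could look like this:
+
+```python
+import requests
+
+payload = {
+    "values": [
+        {
+            "recordId": "1",
+            "data": {"content": "# Heading\n\nSome text to chunk."},
+        }
+    ]
+}
+
+response = requests.post(
+    "http://localhost:7071/api/semantic_text_chunker",  # adjust host/route to your deployment
+    json=payload,
+    # Chunking parameters are passed as string-valued headers
+    headers={
+        "similarity_threshold": "0.8",
+        "max_chunk_tokens": "500",
+        "min_chunk_tokens": "150",
+    },
+)
+print(response.json())
+```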
### MarkUp Cleaner Custom Skill
diff --git a/image_processing/src/image_processing/function_app.py b/image_processing/src/image_processing/function_app.py
index c918eff..cefcdc2 100644
--- a/image_processing/src/image_processing/function_app.py
+++ b/image_processing/src/image_processing/function_app.py
@@ -171,9 +171,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
semantic_text_chunker_config = req.headers
- num_surrounding_sentences = int(
- semantic_text_chunker_config.get("num_surrounding_sentences", 1)
- )
similarity_threshold = float(
semantic_text_chunker_config.get("similarity_threshold", 0.8)
)
@@ -192,7 +189,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
record_tasks = []
semantic_text_chunker_processor = SemanticTextChunker(
- num_surrounding_sentences=num_surrounding_sentences,
similarity_threshold=similarity_threshold,
max_chunk_tokens=max_chunk_tokens,
min_chunk_tokens=min_chunk_tokens,
diff --git a/image_processing/src/image_processing/layout_analysis.py b/image_processing/src/image_processing/layout_analysis.py
index 5a1ef4f..64fc8b0 100644
--- a/image_processing/src/image_processing/layout_analysis.py
+++ b/image_processing/src/image_processing/layout_analysis.py
@@ -22,8 +22,9 @@
LayoutHolder,
PageWiseContentHolder,
NonPageWiseContentHolder,
- PerPageStartingSentenceHolder,
+ PageNumberTrackingHolder,
)
+import re
class StorageAccountHelper:
@@ -341,14 +342,14 @@ def create_page_wise_content(self) -> list[LayoutHolder]:
return page_wise_contents
- def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
+ def create_page_number_tracking_holder(self) -> list[PageNumberTrackingHolder]:
"""Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
Returns:
--------
list: A list of the starting sentence of each page."""
- per_page_starting_sentences = []
+ page_number_tracking_holders = []
for page in self.result.pages:
page_content = self.result.content[
@@ -358,22 +359,38 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde
# Remove any leading whitespace/newlines.
cleaned_content = page_content.lstrip()
- # If a newline appears before a period, split on newline; otherwise, on period.
- if "\n" in cleaned_content:
- first_line = cleaned_content.split("\n", 1)[0]
- elif "." in cleaned_content:
- first_line = cleaned_content.split(".", 1)[0]
+ # Strip out any HTML comments but keep the surrounding content
+ html_comments_pattern = re.compile(r"<!--(.*?)-->", re.DOTALL)
+ cleaned_content = html_comments_pattern.sub("", cleaned_content)
+
+ # Remove anything inside a figure tag
+ cleaned_content = re.sub(
+ r"\s*<figure>(.*?)</figure>",
+ "",
+ cleaned_content,
+ flags=re.DOTALL | re.MULTILINE,
+ )
+ logging.info(f"Page Number: {page.page_number}")
+ logging.info(f"Content for Page Detection: {page_content}")
+ logging.info(f"Cleaned Content for Page Detection: {cleaned_content}")
+
+ if len(cleaned_content) == 0:
+ logging.error(
+ "No content found in the cleaned result for page %s.",
+ page.page_number,
+ )
+ cleaned_content = None
else:
- first_line = cleaned_content
+ cleaned_content = cleaned_content.strip()
- per_page_starting_sentences.append(
- PerPageStartingSentenceHolder(
+ page_number_tracking_holders.append(
+ PageNumberTrackingHolder(
page_number=page.page_number,
- starting_sentence=first_line.strip(),
+ page_content=cleaned_content,
)
)
- return per_page_starting_sentences
+ return page_number_tracking_holders
async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
"""Get the Azure Document Intelligence client.
@@ -522,11 +539,11 @@ async def analyse(self):
if self.extract_figures:
await self.process_figures_from_extracted_content(text_content)
- per_page_starting_sentences = self.create_per_page_starting_sentence()
+ page_number_tracking_holders = self.create_page_number_tracking_holder()
output_record = NonPageWiseContentHolder(
layout=text_content,
- per_page_starting_sentences=per_page_starting_sentences,
+ page_number_tracking_holders=page_number_tracking_holders,
)
except Exception as e:
diff --git a/image_processing/src/image_processing/layout_holders.py b/image_processing/src/image_processing/layout_holders.py
index 8d1535f..9e03cff 100644
--- a/image_processing/src/image_processing/layout_holders.py
+++ b/image_processing/src/image_processing/layout_holders.py
@@ -47,18 +47,18 @@ class PageWiseContentHolder(BaseModel):
page_wise_layout: list[LayoutHolder]
-class PerPageStartingSentenceHolder(BaseModel):
+class PageNumberTrackingHolder(BaseModel):
"""A class to hold the starting sentence of each page."""
page_number: int
- starting_sentence: str
+ page_content: str | None
class NonPageWiseContentHolder(BaseModel):
"""A class to hold the non-page-wise content extracted from the document."""
layout: LayoutHolder
- per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
+ page_number_tracking_holders: list[PageNumberTrackingHolder] = Field(
default_factory=list
)
@@ -69,6 +69,5 @@ class ChunkHolder(BaseModel):
mark_up: str
sections: Optional[list[str]] = Field(default_factory=list)
figures: Optional[list[FigureHolder]] = Field(default_factory=list)
- starting_sentence: Optional[str] = None
cleaned_text: Optional[str] = None
page_number: Optional[int] = Field(default=None)
diff --git a/image_processing/src/image_processing/semantic_text_chunker.py b/image_processing/src/image_processing/semantic_text_chunker.py
index b97c667..cb340c9 100644
--- a/image_processing/src/image_processing/semantic_text_chunker.py
+++ b/image_processing/src/image_processing/semantic_text_chunker.py
@@ -7,18 +7,16 @@
import spacy
import numpy as np
from model2vec import StaticModel
-from layout_holders import PerPageStartingSentenceHolder, ChunkHolder
+from layout_holders import PageNumberTrackingHolder, ChunkHolder
class SemanticTextChunker:
def __init__(
self,
- num_surrounding_sentences: int = 2,
similarity_threshold: float = 0.8,
max_chunk_tokens: int = 500,
- min_chunk_tokens: int = 150,
+ min_chunk_tokens: int = 200,
):
- self.num_surrounding_sentences = num_surrounding_sentences
self.similarity_threshold = similarity_threshold
self.max_chunk_tokens = max_chunk_tokens
self.min_chunk_tokens = min_chunk_tokens
@@ -88,7 +86,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
Returns:
list(str): The list of chunks"""
- logging.info(f"Chunking text: {text}")
+ logging.debug(f"Chunking text: {text}")
sentences = self.split_into_sentences(text)
@@ -111,7 +109,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
f"""Number of Forward pass chunks: {
len(forward_pass_chunks)}"""
)
- logging.info(f"Forward pass chunks: {forward_pass_chunks}")
+ logging.debug(f"Forward pass chunks: {forward_pass_chunks}")
backwards_pass_chunks, _ = self.merge_chunks(
forward_pass_chunks, new_is_table_or_figure_map, forwards_direction=False
@@ -123,7 +121,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
f"""Number of Backaward pass chunks: {
len(reversed_backwards_pass_chunks)}"""
)
- logging.info(f"Backward pass chunks: {reversed_backwards_pass_chunks}")
+ logging.debug(f"Backward pass chunks: {reversed_backwards_pass_chunks}")
cleaned_final_chunks = []
for chunk in reversed_backwards_pass_chunks:
@@ -132,7 +130,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
cleaned_final_chunks.append(ChunkHolder(mark_up=stripped_chunk))
logging.info(f"Number of final chunks: {len(cleaned_final_chunks)}")
- logging.info(f"Chunks: {cleaned_final_chunks}")
+ logging.debug(f"Chunks: {cleaned_final_chunks}")
if len(cleaned_final_chunks) == 0:
raise ValueError("No chunks were generated")
@@ -174,9 +172,12 @@ def split_into_sentences(self, text: str) -> list[str]:
# Filter out empty ... tags
cleaned_text = self.filter_empty_figures(cleaned_text)
- logging.info(f"Cleaned text: {cleaned_text}")
+ logging.debug(f"Cleaned text: {cleaned_text}")
- doc = self._nlp_model(cleaned_text)
+ self._nlp_model.max_length = len(cleaned_text) + 100
+ doc = self._nlp_model(
+ cleaned_text, disable=["ner", "tagger", "lemmatizer", "textcat"]
+ )
tag_split_sentences = []
# Pattern to match the closing and opening tag junctions with whitespace in between
@@ -258,103 +259,98 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]):
return grouped_sentences, is_table_or_figure_map
- def look_ahead_and_behind_sentences(
- self, total_sentences, is_table_or_figure_map, current_sentence_index
- ):
- is_table_or_figure_ahead = False
- is_table_or_figure_behind = False
-
- distance_to_next_figure = self.num_surrounding_sentences
-
- if current_sentence_index < self.num_surrounding_sentences:
- is_table_or_figure_behind = is_table_or_figure_map[0]
- else:
- is_table_or_figure_behind = is_table_or_figure_map[
- current_sentence_index - self.num_surrounding_sentences
- ]
-
- surround_sentences_gap_to_test = self.num_surrounding_sentences
- if current_sentence_index + self.num_surrounding_sentences >= total_sentences:
- is_table_or_figure_ahead = is_table_or_figure_map[-1]
- surround_sentences_gap_to_test = total_sentences - current_sentence_index
- else:
- is_table_or_figure_ahead = is_table_or_figure_map[
- current_sentence_index + self.num_surrounding_sentences
- ]
-
- for (
- next_sentence_is_table_or_figure_index,
- next_sentence_is_table_or_figure,
- ) in enumerate(
- is_table_or_figure_map[
- current_sentence_index : current_sentence_index
- + surround_sentences_gap_to_test
- ]
- ):
- if next_sentence_is_table_or_figure:
- distance_to_next_figure = next_sentence_is_table_or_figure_index
-
- return (
- is_table_or_figure_ahead,
- is_table_or_figure_behind,
- min(surround_sentences_gap_to_test, distance_to_next_figure),
+ def remove_figures(self, text):
+ figure_tag_pattern = r"\s*<figure[^>]*>(.*?)</figure>"
+ return re.sub(figure_tag_pattern, "", text, flags=re.DOTALL).strip()
def merge_similar_chunks(self, current_sentence, current_chunk, forwards_direction):
new_chunk = None
- def retrieve_current_chunk_up_to_n(n):
+ def retrieve_current_chunk_up_to_minus_n(n):
if forwards_direction:
return " ".join(current_chunk[:-n])
else:
return " ".join(reversed(current_chunk[:-n]))
- def retrieve_current_chunks_from_n(n):
- if forwards_direction:
- return " ".join(current_chunk[n:])
- else:
- return " ".join(reversed(current_chunk[:-n]))
-
def retrive_current_chunk_at_n(n):
if forwards_direction:
return current_chunk[n]
else:
return current_chunk[n]
+ def retrieve_current_chunks_from_n(n):
+ if forwards_direction:
+ return " ".join(current_chunk[n:])
+ else:
+ return " ".join(reversed(current_chunk[n:]))
+
def get_current_chunk_tokens(chunk_segments):
+ if isinstance(chunk_segments, str):
+ return self.num_tokens_from_string(chunk_segments)
+
return self.num_tokens_from_string(" ".join(chunk_segments))
+ if len(current_chunk) == 1:
+ logging.debug("Chunk too small to compare")
+ return new_chunk, current_chunk
+
+ if len(current_chunk) > 2:
+ would_be_end_of_old_chunk = retrieve_current_chunk_up_to_minus_n(1)
+ would_be_start_of_new_chunk = [retrive_current_chunk_at_n(-1)]
+ else:
+ would_be_end_of_old_chunk = retrive_current_chunk_at_n(0)
+ would_be_start_of_new_chunk = [retrive_current_chunk_at_n(1)]
+
current_chunk_tokens = get_current_chunk_tokens(current_chunk)
+ logging.debug(f"Current chunk tokens: {current_chunk_tokens}")
+ would_be_end_of_old_chunk_tokens = get_current_chunk_tokens(
+ would_be_end_of_old_chunk
+ )
+ logging.debug(f"Would be new chunk tokens: {would_be_end_of_old_chunk_tokens}")
- if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
- # Calculate the tokens if we were to split
- if len(current_chunk) > 2:
- would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
- would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
- else:
- would_be_new_chunk = retrive_current_chunk_at_n(0)
- would_be_current_chunk = [retrive_current_chunk_at_n(1)]
+ would_be_end_of_old_chunk_without_figures = self.remove_figures(
+ would_be_end_of_old_chunk
+ )
- if (
- get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
- and get_current_chunk_tokens(would_be_current_chunk)
- >= self.min_chunk_tokens
- ):
- logging.info("Comparing chunks")
- if (
- current_chunk_tokens >= self.max_chunk_tokens
- or self.sentence_similarity(
- retrieve_current_chunks_from_n(-2), current_sentence
- )
- < self.similarity_threshold
- ):
- return would_be_new_chunk, would_be_current_chunk
- else:
- logging.info("Chunk too small to compare")
- else:
- logging.info("Chunk too small to compare")
+ would_be_end_of_old_chunk_without_figures_tokens = self.num_tokens_from_string(
+ would_be_end_of_old_chunk_without_figures
+ )
+
+ would_be_start_of_new_chunk_without_figures = self.remove_figures(
+ " ".join(would_be_start_of_new_chunk)
+ )
+
+ if len(would_be_start_of_new_chunk_without_figures) == 0:
+ logging.debug("Chunk would only contain figures. Not comparing")
+ return new_chunk, current_chunk
+
+ if (
+ would_be_end_of_old_chunk_tokens < self.min_chunk_tokens
+ or would_be_end_of_old_chunk_without_figures_tokens
+ < (self.min_chunk_tokens / 2)
+ ):
+ logging.debug("Chunk too small. Not comparing")
+ return new_chunk, current_chunk
+
+ if would_be_end_of_old_chunk_without_figures_tokens > self.max_chunk_tokens:
+ logging.debug("Chunk too large. Not comparing")
+ return would_be_end_of_old_chunk, would_be_start_of_new_chunk
+
+ similarity_set = retrieve_current_chunks_from_n(-2)
- return new_chunk, current_chunk
+ # Calculate the tokens if we were to split
+ logging.debug("Comparing chunks")
+ if (
+ current_chunk_tokens > (self.max_chunk_tokens * 1.5)
+ or self.sentence_similarity(similarity_set, current_sentence)
+ < self.similarity_threshold
+ ):
+ return would_be_end_of_old_chunk, would_be_start_of_new_chunk
+ else:
+ logging.debug("Above similarity threshold")
+ return new_chunk, current_chunk
def is_markdown_heading(self, text):
return text.strip().startswith("#")
@@ -385,96 +381,73 @@ def retrieve_current_chunk():
index += 1
continue
- # Detect if table or figure
- if is_table_or_figure_map[current_sentence_index]:
- if forwards_direction:
+ if forwards_direction and self.is_markdown_heading(current_sentence):
+ heading_level = current_sentence.count("#")
+
+ if heading_level in [1, 2]:
+ # Start new chunk
if len(current_chunk) > 0:
- current_chunk.append(current_sentence)
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(True)
- current_chunk = []
- else:
- current_chunk.append(current_sentence)
- else:
- # On the backwards pass we don't want to add to the table chunk
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(True)
- current_chunk = [current_sentence]
+ current_chunk = retrieve_current_chunk()
+ chunks.append(current_chunk)
+ new_is_table_or_figure_map.append(
+ self.sentence_contains_figure_or_table(current_chunk)
+ )
+ current_chunk = [current_sentence]
- index += 1
- continue
- elif forwards_direction:
- # Look ahead to see if figure of table is coming up
- # We only do this on the forward pass
- (
- is_table_or_figure_ahead,
- is_table_or_figure_behind,
- min_of_distance_to_next_figure_or_num_surrounding_sentences,
- ) = self.look_ahead_and_behind_sentences(
- total_sentences, is_table_or_figure_map, current_sentence_index
- )
+ index += 1
+ continue
- if is_table_or_figure_behind:
- # Check if Makrdown heading
- if self.is_markdown_heading(current_sentence):
- # Start new chunk
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(False)
- current_chunk = [current_sentence]
- else:
- # Finish off
- current_chunk.append(current_sentence)
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(False)
- current_chunk = []
+ # Detect if table or figure
+ if forwards_direction and is_table_or_figure_map[current_sentence_index]:
+ if len(current_chunk) > 0:
+ current_chunk.append(current_sentence)
+ chunks.append(retrieve_current_chunk())
+ new_is_table_or_figure_map.append(True)
+ current_chunk = []
index += 1
continue
- elif is_table_or_figure_ahead:
- # Add to the ahead chunk
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(False)
- if forwards_direction:
- current_chunk = sentences[
- current_sentence_index : current_sentence_index
- + min_of_distance_to_next_figure_or_num_surrounding_sentences
- ]
- else:
- current_chunk = sentences[
- current_sentence_index : current_sentence_index
- - min_of_distance_to_next_figure_or_num_surrounding_sentences : -1
- ]
- index += min_of_distance_to_next_figure_or_num_surrounding_sentences
- continue
# now group semanticly
- num_tokens = self.num_tokens_from_string(current_sentence)
+ current_chunk.append(current_sentence)
- if num_tokens >= self.max_chunk_tokens:
- chunks.append(current_sentence)
- new_is_table_or_figure_map.append(False)
- else:
- current_chunk.append(current_sentence)
+ new_chunk, current_chunk = self.merge_similar_chunks(
+ current_sentence,
+ current_chunk,
+ forwards_direction=forwards_direction,
+ )
- new_chunk, current_chunk = self.merge_similar_chunks(
- current_sentence,
- current_chunk,
- forwards_direction=forwards_direction,
+ if new_chunk is not None:
+ chunks.append(new_chunk)
+ new_is_table_or_figure_map.append(
+ self.sentence_contains_figure_or_table(new_chunk)
)
- if new_chunk is not None:
- chunks.append(new_chunk)
- new_is_table_or_figure_map.append(False)
-
index += 1
if len(current_chunk) > 0:
final_chunk = retrieve_current_chunk()
- chunks.append(final_chunk)
- new_is_table_or_figure_map.append(
- self.sentence_contains_figure_or_table(final_chunk)
- )
+ # Get tokens of this chunk
+ if (
+ self.num_tokens_from_string(final_chunk) < self.min_chunk_tokens
+ and len(chunks) > 0
+ ):
+ # Add the last chunk to the new chunks
+ if forwards_direction:
+ final_chunk = chunks[-1] + " " + final_chunk
+ else:
+ final_chunk = final_chunk + " " + chunks[-1]
+
+ chunks[-1] = final_chunk
+ new_is_table_or_figure_map[-1] = self.sentence_contains_figure_or_table(
+ final_chunk
+ )
+ else:
+ chunks.append(final_chunk)
+ new_is_table_or_figure_map.append(
+ self.sentence_contains_figure_or_table(final_chunk)
+ )
return chunks, new_is_table_or_figure_map
@@ -486,7 +459,7 @@ def sentence_similarity(self, text_1, text_2):
magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2)
similarity = dot_product / magnitude if magnitude != 0 else 0.0
- logging.info(
+ logging.debug(
f"""Similarity between '{text_1}' and '{
text_2}': {similarity}"""
)
@@ -495,28 +468,83 @@ def sentence_similarity(self, text_1, text_2):
def assign_page_number_to_chunks(
self,
chunks: list[ChunkHolder],
- per_page_starting_sentences: list[PerPageStartingSentenceHolder],
+ page_number_tracking_holders: list[PageNumberTrackingHolder],
) -> list[ChunkHolder]:
"""Assigns page numbers to the chunks based on the starting sentences of each page.
Args:
chunks (list[ChunkHolder]): The list of chunks.
- per_page_starting_sentences (list[PerPageStartingSentenceHolder]): The list of starting sentences of each page.
+ page_number_tracking_holders (list[PageNumberTrackingHolder]): The list of page content holders for each page.
Returns:
list[ChunkHolder]: The list of chunks with page numbers assigned."""
page_number = 1
for chunk in chunks:
- for per_page_starting_sentence in per_page_starting_sentences[
- page_number - 1 :
- ]:
- if per_page_starting_sentence.starting_sentence in chunk:
- logging.info(
- "Assigning page number %i to chunk",
- per_page_starting_sentence.page_number,
+ # Remove any leading whitespace/newlines.
+ cleaned_content = chunk.mark_up.lstrip()
+ # Strip out any HTML comments but keep the surrounding content
+ html_comments_pattern = re.compile(r"<!--(.*?)-->", re.DOTALL)
+ cleaned_content = html_comments_pattern.sub("", cleaned_content)
+
+ # Use the nlp model to get the first sentence
+ sentences = list(
+ self._nlp_model(
+ cleaned_content, disable=["ner", "tagger", "lemmatizer", "textcat"]
+ ).sents
+ )
+
+ if len(sentences) == 0:
+ first_line = None
+ else:
+ first_sentence = sentences[0].text.strip()
+
+ if "#" in first_sentence:
+ logging.info("Splitting on hash")
+ # Deliberately split on the next hash to get the first line of the markdown content
+ first_line = (
+ first_sentence.split(" #", 1)[0]
+ .strip()
+ .split("\n", 1)[0]
+ .strip()
)
- page_number = per_page_starting_sentence.page_number
- break
+ elif "
" in first_sentence:
+ logging.info("Joining onto second sentence to form first row")
+ if len(sentences) > 1:
+ first_line = (
+ first_sentence.lstrip() + "\n" + sentences[1].text.strip()
+ )
+ else:
+ first_line = first_sentence
+ elif "\n" in first_sentence:
+ logging.info("Splitting on newline")
+ first_line = first_sentence.split("\n", 1)[0].strip()
+ elif "." in first_sentence:
+ logging.info("Splitting on period")
+ first_line = first_sentence.split(".", 1)[0].strip()
+ else:
+ logging.info("No split found")
+ first_line = first_sentence.strip()
+
+ if first_line is not None:
+ logging.info(f"Looking for First line: {first_line}")
+ for page_number_tracking_holder in page_number_tracking_holders[
+ page_number - 1 :
+ ]:
+ if page_number_tracking_holder.page_content is not None:
+ if (
+ first_line == page_number_tracking_holder.page_content
+ or first_line in page_number_tracking_holder.page_content
+ or first_line
+ in page_number_tracking_holder.page_content.replace(
+ "\n", " "
+ )
+ ):
+ logging.info(
+ "Assigning page number %i to chunk",
+ page_number,
+ )
+ page_number = page_number_tracking_holder.page_number
+ break
chunk.page_number = page_number
return chunks
@@ -545,16 +573,16 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
# scenarios when page by chunking is enabled
chunks = await text_chunker.chunk(record["data"]["content"])
- if "per_page_starting_sentences" in record["data"]:
- per_page_starting_sentences = [
- PerPageStartingSentenceHolder(**sentence)
- for sentence in record["data"]["per_page_starting_sentences"]
+ if "page_number_tracking_holders" in record["data"]:
+ page_number_tracking_holders = [
+ PageNumberTrackingHolder(**sentence)
+ for sentence in record["data"]["page_number_tracking_holders"]
]
- logging.info(f"Per page starting sentences: {per_page_starting_sentences}")
+ logging.info(f"Per page holders: {page_number_tracking_holders}")
chunks = text_chunker.assign_page_number_to_chunks(
- chunks, per_page_starting_sentences
+ chunks, page_number_tracking_holders
)
cleaned_record["data"]["chunks"] = [
diff --git a/image_processing/tests/image_processing/test_figure_app.py b/image_processing/tests/image_processing/test_figure_app.py
index e86a1f4..dcdd1b9 100644
--- a/image_processing/tests/image_processing/test_figure_app.py
+++ b/image_processing/tests/image_processing/test_figure_app.py
@@ -188,7 +188,6 @@ async def dummy_process_semantic_text_chunker(value, processor):
)
headers = {
- "num_surrounding_sentences": "2",
"similarity_threshold": "0.9",
"max_chunk_tokens": "600",
"min_chunk_tokens": "60",
diff --git a/image_processing/tests/image_processing/test_layout_analysis.py b/image_processing/tests/image_processing/test_layout_analysis.py
index e9de95a..5c4d642 100644
--- a/image_processing/tests/image_processing/test_layout_analysis.py
+++ b/image_processing/tests/image_processing/test_layout_analysis.py
@@ -11,6 +11,8 @@
LayoutAnalysis,
)
+from layout_holders import LayoutHolder
+
# --- Dummy classes to simulate ADI results and figures ---
class DummySpan:
@@ -436,7 +438,7 @@ class DummyResultContent:
assert layout.page_offsets == 0
-def test_create_per_page_starting_sentence():
+def test_create_page_number_tracking_holder():
# Create a LayoutAnalysis instance.
la = LayoutAnalysis(record_id=200, source="dummy")
@@ -449,17 +451,17 @@ class DummyResultContent:
dummy_result = DummyResultContent()
dummy_result.content = "HelloWorld. This is a test sentence."
# DummyPage creates a page with spans as a list of dictionaries.
- dummy_result.pages = [DummyPage(0, 10, 1)]
+ dummy_result.pages = [DummyPage(0, 36, 1)]
la.result = dummy_result
- sentences = la.create_per_page_starting_sentence()
- assert len(sentences) == 1
- sentence = sentences[0]
- assert sentence.page_number == 1
- assert sentence.starting_sentence == "HelloWorld"
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ tracker = page_number_trackers[0]
+ assert tracker.page_number == 1
+ assert tracker.page_content == "HelloWorld. This is a test sentence."
-def test_create_per_page_starting_sentence_multiple_pages():
+def test_create_page_number_tracking_holder_multiple_pages():
# Create a LayoutAnalysis instance.
la = LayoutAnalysis(record_id=300, source="dummy")
@@ -479,15 +481,337 @@ class DummyResultContent:
]
la.result = dummy_result
- # Call create_per_page_starting_sentence and check results.
- sentences = la.create_per_page_starting_sentence()
- assert len(sentences) == 2
+ # Call create_page_number_tracking_holder and check results.
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 2
# For page 1, the substring is "Page one." -> split on "." gives "Page one"
- assert sentences[0].page_number == 1
- assert sentences[0].starting_sentence == "Page one"
+ assert page_number_trackers[0].page_number == 1
+ assert page_number_trackers[0].page_content == "Page one."
# For page 2, the substring is "Page two text and" -> split on "." gives the entire string
- assert sentences[1].page_number == 2
+ assert page_number_trackers[1].page_number == 2
# We strip potential leading/trailing spaces for validation.
- assert sentences[1].starting_sentence.strip() == "Page two text and more content"
+ assert (
+ page_number_trackers[1].page_content.strip()
+ == "Page two text and more content. This is more random content that is on page 2."
+ )
+
+
+# Test for download_figure_image with retry logic
+@pytest.mark.asyncio
+async def test_download_figure_image_with_retry(monkeypatch):
+ """Test the download_figure_image method with retry logic."""
+ la = LayoutAnalysis(record_id=101, source="dummy")
+ la.operation_id = "op101"
+ la.result = DummyResult("content", [], [], model_id="model101")
+
+ # Create a counter to track number of attempts
+ call_count = 0
+
+ # Mock document_intelligence_client.get_analyze_result_figure
+ class MockResponse:
+ def __init__(self):
+ self.chunks = [b"chunk1", b"chunk2"]
+
+ def __aiter__(self):
+ return self
+
+ async def __anext__(self):
+ if not self.chunks:
+ raise StopAsyncIteration
+ return self.chunks.pop(0)
+
+ class MockClient:
+ async def __aenter__(self):
+ return self
+
+ async def __aexit__(self, *args):
+ pass
+
+ async def get_analyze_result_figure(self, model_id, result_id, figure_id):
+ nonlocal call_count
+ call_count += 1
+ if call_count == 1:
+ # Fail on first attempt
+ raise Exception("Temporary failure")
+ # Succeed on subsequent attempts
+ return MockResponse()
+
+ # Patch get_document_intelligence_client to return our mock
+ async def mock_get_client():
+ return MockClient()
+
+ monkeypatch.setattr(la, "get_document_intelligence_client", mock_get_client)
+
+ # Call the method - should succeed after retry
+ result = await la.download_figure_image("fig1")
+
+ # Check that it was called more than once (at least one retry)
+ assert call_count > 1
+ # Check the result contains both chunks
+ assert result == b"chunk1chunk2"
+
+
+# Test for non-page-wise analysis with figures
+@pytest.mark.asyncio
+async def test_analyse_non_page_wise_with_figures(monkeypatch, dummy_storage_helper):
+ """Test non-page-wise analysis with figures."""
+ source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt"
+ la = LayoutAnalysis(
+ page_wise=False, extract_figures=True, record_id=102, source=source
+ )
+ la.extract_file_info()
+
+ monkeypatch.setattr(
+ la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper)
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "download_blob_to_temp_dir",
+ AsyncMock(return_value=("/tmp/dummy.txt", {})),
+ )
+
+ # Create a dummy result with content and a figure
+ dummy_page = DummyPage(0, 20, 1)
+ dummy_figure = DummyFigure(
+ "fig102",
+ offset=10,
+ length=5,
+ page_number=1,
+ caption_content="Figure 102 caption",
+ )
+ dummy_result = DummyResult(
+ content="Full document content", pages=[dummy_page], figures=[dummy_figure]
+ )
+
+ async def dummy_analyse_document(file_path):
+ la.result = dummy_result
+ la.operation_id = "op102"
+
+ monkeypatch.setattr(la, "analyse_document", dummy_analyse_document)
+
+ # Mock figure download and upload
+ monkeypatch.setattr(
+ la, "download_figure_image", AsyncMock(return_value=b"figure102_image_data")
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "upload_blob",
+ AsyncMock(return_value="http://dummy.url/fig102.png"),
+ )
+
+ result = await la.analyse()
+
+ assert result["recordId"] == 102
+ assert result["data"] is not None
+ # In non-page-wise mode, we should have layout and page_number_tracking_holders
+ assert "layout" in result["data"]
+ assert "page_number_tracking_holders" in result["data"]
+
+ # Verify figure was processed
+ layout = result["data"]["layout"]
+ assert "figures" in layout
+ figures = layout["figures"]
+ assert len(figures) == 1
+ assert figures[0]["figure_id"] == "fig102"
+ assert figures[0]["caption"] == "Figure 102 caption"
+ expected_b64 = base64.b64encode(b"figure102_image_data").decode("utf-8")
+ assert figures[0]["data"] == expected_b64
+
+
+# Test for when extract_figures is False
+@pytest.mark.asyncio
+async def test_analyse_without_extracting_figures(monkeypatch, dummy_storage_helper):
+ """Test analysis when extract_figures is False."""
+ source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt"
+ la = LayoutAnalysis(
+ page_wise=True, extract_figures=False, record_id=103, source=source
+ )
+ la.extract_file_info()
+
+ monkeypatch.setattr(
+ la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper)
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "download_blob_to_temp_dir",
+ AsyncMock(return_value=("/tmp/dummy.txt", {})),
+ )
+
+ # Create a dummy result with content and a figure
+ dummy_page = DummyPage(0, 10, 1)
+ dummy_figure = DummyFigure(
+ "fig103",
+ offset=5,
+ length=3,
+ page_number=1,
+ caption_content="Figure 103 caption",
+ )
+ dummy_result = DummyResult(
+ content="Page content", pages=[dummy_page], figures=[dummy_figure]
+ )
+
+ async def dummy_analyse_document(file_path):
+ la.result = dummy_result
+ la.operation_id = "op103"
+
+ monkeypatch.setattr(la, "analyse_document", dummy_analyse_document)
+
+ # Add spy on process_figures_from_extracted_content to ensure it's not called
+ process_figures_spy = AsyncMock()
+ monkeypatch.setattr(
+ la, "process_figures_from_extracted_content", process_figures_spy
+ )
+
+ result = await la.analyse()
+
+ # Verify the function was not called
+ process_figures_spy.assert_not_called()
+
+ assert result["recordId"] == 103
+ assert result["data"] is not None
+ # Verify we have page_wise_layout
+ assert "page_wise_layout" in result["data"]
+ layouts = result["data"]["page_wise_layout"]
+ assert len(layouts) == 1
+ # Each layout should have an empty figures list
+ assert layouts[0]["figures"] == []
+
+
+# Test for HTML comment handling in create_page_number_tracking_holder
+def test_create_page_number_tracking_holder_html_comments():
+ """Test HTML comment handling in page content extraction."""
+ la = LayoutAnalysis(record_id=104, source="dummy")
+
+ class DummyResultContent:
+ pass
+
+ dummy_result = DummyResultContent()
+ # Content with an HTML comment embedded in the text
+ dummy_result.content = "Before <!-- Comment -->After"
+ dummy_result.pages = [DummyPage(0, 29, 1)] # Full content
+ la.result = dummy_result
+
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ # HTML comments should be removed
+ assert page_number_trackers[0].page_content == "Before After"
+
+
+# Test for figure tag handling in create_page_number_tracking_holder
+def test_create_page_number_tracking_holder_figure_tags():
+ """Test figure tag handling in page content extraction."""
+ la = LayoutAnalysis(record_id=105, source="dummy")
+
+ class DummyResultContent:
+ pass
+
+ dummy_result = DummyResultContent()
+ # Content with figure tags
+ dummy_result.content = "Before Figure content After"
+ dummy_result.pages = [DummyPage(0, 44, 1)] # Full content
+ la.result = dummy_result
+
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ # Figure content should be removed
+ assert page_number_trackers[0].page_content == "Before After"
+
+
+# Test handling of empty content
+def test_create_page_number_tracking_holder_empty_content():
+ """Test handling of empty content in page tracking."""
+ la = LayoutAnalysis(record_id=106, source="dummy")
+
+ class DummyResultContent:
+ pass
+
+ dummy_result = DummyResultContent()
+ # Empty content
+ dummy_result.content = ""
+ dummy_result.pages = [DummyPage(0, 0, 1)] # Empty content
+ la.result = dummy_result
+
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ # Page content should be None for empty content
+ assert page_number_trackers[0].page_content is None
+
+
+# Test for process_layout_analysis with page_wise=True
+@pytest.mark.asyncio
+async def test_process_layout_analysis_page_wise(monkeypatch):
+ """Test process_layout_analysis with page_wise=True."""
+ record = {
+ "recordId": "107",
+ "data": {"source": "https://dummy.blob.core.windows.net/container/blob.pdf"},
+ }
+
+ # Create a mock LayoutAnalysis
+ mock_layout_analysis = AsyncMock()
+ mock_layout_analysis.analyse = AsyncMock(
+ return_value={"recordId": "107", "data": {"result": "success"}}
+ )
+
+ # Mock the LayoutAnalysis constructor
+ def mock_layout_analysis_constructor(*args, **kwargs):
+ # Verify page_wise=True was passed
+ assert kwargs["page_wise"] is True
+ return mock_layout_analysis
+
+ monkeypatch.setattr(
+ "layout_analysis.LayoutAnalysis", mock_layout_analysis_constructor
+ )
+
+ result = await process_layout_analysis(record, page_wise=True)
+
+ # Verify analyse was called
+ mock_layout_analysis.analyse.assert_called_once()
+ assert result["recordId"] == "107"
+ assert result["data"] == {"result": "success"}
+
+
+# Test handling figures without captions
+@pytest.mark.asyncio
+async def test_figure_without_caption(monkeypatch, dummy_storage_helper):
+ """Test handling figures without captions."""
+ source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt"
+ la = LayoutAnalysis(
+ page_wise=False, extract_figures=True, record_id=108, source=source
+ )
+ la.extract_file_info()
+
+ monkeypatch.setattr(
+ la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper)
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "download_blob_to_temp_dir",
+ AsyncMock(return_value=("/tmp/dummy.txt", {})),
+ )
+
+ # Create a figure without a caption (caption=None)
+ dummy_figure = DummyFigure(
+ "fig108", offset=5, length=3, page_number=1, caption_content=None
+ )
+ dummy_result = DummyResult(
+ content="Content", pages=[DummyPage(0, 7, 1)], figures=[dummy_figure]
+ )
+
+ la.result = dummy_result
+ monkeypatch.setattr(
+ la, "download_figure_image", AsyncMock(return_value=b"figure108_image_data")
+ )
+
+ # Create a minimal layout holder for testing
+ layout_holder = LayoutHolder(content="Test", page_number=1, page_offsets=0)
+
+ # Process the figures
+ await la.process_figures_from_extracted_content(layout_holder)
+
+ # Check that the figure was processed despite having no caption
+ assert len(layout_holder.figures) == 1
+ figure = layout_holder.figures[0]
+ assert figure.figure_id == "fig108"
+ assert figure.caption is None # Caption should be None
diff --git a/image_processing/tests/image_processing/test_layout_holders.py b/image_processing/tests/image_processing/test_layout_holders.py
index 3d2d1c4..4e23893 100644
--- a/image_processing/tests/image_processing/test_layout_holders.py
+++ b/image_processing/tests/image_processing/test_layout_holders.py
@@ -8,7 +8,7 @@
PageWiseContentHolder,
NonPageWiseContentHolder,
ChunkHolder,
- PerPageStartingSentenceHolder,
+ PageNumberTrackingHolder,
)
@@ -74,34 +74,32 @@ def test_chunk_holder_creation():
mark_up="Sample markup",
sections=["Section1", "Section2"],
figures=[],
- starting_sentence="First sentence",
cleaned_text="Cleaned text content",
page_number=1,
)
assert chunk.mark_up == "Sample markup"
assert chunk.sections == ["Section1", "Section2"]
- assert chunk.starting_sentence == "First sentence"
assert chunk.cleaned_text == "Cleaned text content"
assert chunk.page_number == 1
-def test_per_page_starting_sentence_holder_creation():
- sentence = PerPageStartingSentenceHolder(
- page_number=1, starting_sentence="This is the starting sentence."
+def test_per_page_page_content_holder_creation():
+ sentence = PageNumberTrackingHolder(
+ page_number=1, page_content="This is the full content."
)
assert sentence.page_number == 1
- assert sentence.starting_sentence == "This is the starting sentence."
+ assert sentence.page_content == "This is the full content."
-def test_non_page_wise_content_holder_with_sentences():
+def test_non_page_wise_content_holder_with_page_number_trackers():
layout = LayoutHolder(content="Full document")
- sentences = [
- PerPageStartingSentenceHolder(page_number=1, starting_sentence="Start 1"),
- PerPageStartingSentenceHolder(page_number=2, starting_sentence="Start 2"),
+ page_number_trackers = [
+ PageNumberTrackingHolder(page_number=1, page_content="Start 1"),
+ PageNumberTrackingHolder(page_number=2, page_content="Start 2"),
]
non_page_holder = NonPageWiseContentHolder(
- layout=layout, per_page_starting_sentences=sentences
+ layout=layout, page_number_tracking_holders=page_number_trackers
)
assert non_page_holder.layout.content == "Full document"
- assert len(non_page_holder.per_page_starting_sentences) == 2
- assert non_page_holder.per_page_starting_sentences[0].starting_sentence == "Start 1"
+ assert len(non_page_holder.page_number_tracking_holders) == 2
+ assert non_page_holder.page_number_tracking_holders[0].page_content == "Start 1"
diff --git a/image_processing/tests/image_processing/test_semantic_text_chunker.py b/image_processing/tests/image_processing/test_semantic_text_chunker.py
index 59e8364..07277c4 100644
--- a/image_processing/tests/image_processing/test_semantic_text_chunker.py
+++ b/image_processing/tests/image_processing/test_semantic_text_chunker.py
@@ -1,3 +1,5 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
import pytest
from unittest.mock import AsyncMock, MagicMock
@@ -6,6 +8,8 @@
SemanticTextChunker,
)
+from layout_holders import ChunkHolder, PageNumberTrackingHolder
+
# --- Dummy Classes for Process-Level Tests ---
@@ -18,75 +22,80 @@ def model_dump(self, by_alias=False):
return {"mark_up": self.mark_up, "page_number": self.page_number}
-class DummyPerPageStartingSentenceHolder:
- def __init__(self, starting_sentence, page_number):
- self.starting_sentence = starting_sentence
+class DummyPageNumberTrackingHolder:
+ def __init__(self, page_content, page_number):
+ self.page_content = page_content
self.page_number = page_number
# --- Process-Level Tests (Using Dummy Chunker) ---
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_success_without_page():
- """Test a successful chunking when no per-page starting sentences are provided."""
- record = {"recordId": "1", "data": {"content": "Some content to be chunked."}}
+@pytest.mark.parametrize(
+ "chunk_contents, page_content, expected_page",
+ [
+ # Test matching on markdown heading
+ (["# Title", "Content"], "# Title", 2),
+ # Test matching on newline content
+ (["First line", "Second line"], "First line", 3),
+ # Test matching on period
+ (["First sentence. Second sentence"], "First sentence. Second sentence", 4),
+ # Test matching on table
+ (["
Table content
"], "", 1),
+ # Test no match (should get default page 1)
+ (["Content not in any page_content"], "Different content", 1),
+ ],
+)
+def test_assign_page_number_to_chunks(chunk_contents, page_content, expected_page):
+ """Test the page assignment logic for different types of content."""
+ # Create a real SemanticTextChunker instance
+ chunker = SemanticTextChunker()
+
+ # Create chunks with the different content types under test
+ chunks = [ChunkHolder(mark_up=chunk_content) for chunk_content in chunk_contents]
+
+ # Create page tracking holders
+ page_tracking_holders = [
+ PageNumberTrackingHolder(page_content="", page_number=1),
+ PageNumberTrackingHolder(page_content="# Title", page_number=2),
+ PageNumberTrackingHolder(page_content="First line", page_number=3),
+ PageNumberTrackingHolder(page_content="First sentence", page_number=4),
+ PageNumberTrackingHolder(page_content="Different content", page_number=5),
+ ]
- dummy_chunk = DummyChunkHolder("chunk1")
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk])
- dummy_text_chunker.assign_page_number_to_chunks = MagicMock()
+ # Call the method being tested
+ result_chunks = chunker.assign_page_number_to_chunks(chunks, page_tracking_holders)
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "1"
- assert result["data"] is not None
- chunks = result["data"]["chunks"]
- assert isinstance(chunks, list)
- assert len(chunks) == 1
- assert chunks[0]["mark_up"] == "chunk1"
- # When no page info is provided, page_number remains unchanged (None in our dummy).
- assert chunks[0]["page_number"] is None
+ # Verify the page number was correctly assigned
+ assert result_chunks[0].page_number == expected_page
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_success_with_page():
- """Test a successful chunking when per-page starting sentences are provided and match a chunk."""
- record = {
- "recordId": "2",
- "data": {
- "content": "Some content to be chunked.",
- "per_page_starting_sentences": [
- {"starting_sentence": "chunk", "page_number": 5}
- ],
- },
- }
+def test_assign_page_number_to_chunks_multiple_chunks():
+ """Test assigning page numbers to multiple chunks."""
+ chunker = SemanticTextChunker()
- dummy_chunk = DummyChunkHolder("This dummy chunk contains chunk in its text")
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk])
+ # Create multiple chunks
+ chunks = [
+ ChunkHolder(mark_up="# Introduction\nThis is the first section."),
+ ChunkHolder(mark_up="# Methods\nThis describes the methods used."),
+ ChunkHolder(mark_up="# Results\nThese are the results."),
+ ]
- def dummy_assign_page(chunks, per_page_starting_sentences):
- ps_objs = [
- DummyPerPageStartingSentenceHolder(**ps.__dict__)
- for ps in per_page_starting_sentences
- ]
- page_number = 1
- for chunk in chunks:
- for ps in ps_objs:
- if ps.starting_sentence in chunk.mark_up:
- page_number = ps.page_number
- break
- chunk.page_number = page_number
- return chunks
+ # Create page tracking holders for different sections
+ page_tracking_holders = [
+ PageNumberTrackingHolder(page_content="# Introduction", page_number=1),
+ PageNumberTrackingHolder(page_content="# Methods", page_number=3),
+ PageNumberTrackingHolder(page_content="# Results", page_number=5),
+ ]
- dummy_text_chunker.assign_page_number_to_chunks = dummy_assign_page
+ # Call the method being tested
+ result_chunks = chunker.assign_page_number_to_chunks(chunks, page_tracking_holders)
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "2"
- chunks = result["data"]["chunks"]
- assert isinstance(chunks, list)
- assert len(chunks) == 1
- assert chunks[0]["page_number"] == 5
+ # Verify page numbers were correctly assigned
+ assert result_chunks[0].page_number == 1
+ assert result_chunks[1].page_number == 3
+ assert result_chunks[2].page_number == 5
@pytest.mark.asyncio
@@ -119,9 +128,9 @@ async def test_process_semantic_text_chunker_multiple_chunks():
"recordId": "4",
"data": {
"content": "Content that generates multiple chunks.",
- "per_page_starting_sentences": [
- {"starting_sentence": "first_page", "page_number": 3},
- {"starting_sentence": "second_page", "page_number": 4},
+ "page_number_tracking_holders": [
+ {"page_content": "first_page", "page_number": 3},
+ {"page_content": "second_page", "page_number": 4},
],
},
}
@@ -131,15 +140,15 @@ async def test_process_semantic_text_chunker_multiple_chunks():
dummy_text_chunker = MagicMock()
dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk1, dummy_chunk2])
- def dummy_assign_page(chunks, per_page_starting_sentences):
+ def dummy_assign_page(chunks, page_number_tracking_holders):
ps_objs = [
- DummyPerPageStartingSentenceHolder(**ps.__dict__)
- for ps in per_page_starting_sentences
+ DummyPageNumberTrackingHolder(**ps.__dict__)
+ for ps in page_number_tracking_holders
]
page_number = 1
for chunk in chunks:
for ps in ps_objs:
- if ps.starting_sentence in chunk.mark_up:
+ if ps.page_content in chunk.mark_up:
page_number = ps.page_number
break
chunk.page_number = page_number
@@ -156,55 +165,6 @@ def dummy_assign_page(chunks, per_page_starting_sentences):
assert chunks[1]["page_number"] == 4
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_empty_page_sentences():
- """
- Test a record where 'per_page_starting_sentences' exists but is empty.
- In this case, the default page (1) is assigned.
- """
- record = {
- "recordId": "5",
- "data": {
- "content": "Some content to be chunked.",
- "per_page_starting_sentences": [],
- },
- }
-
- dummy_chunk = DummyChunkHolder("Chunk without any page indicator")
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk])
-
- def dummy_assign_page(chunks, per_page_starting_sentences):
- for chunk in chunks:
- chunk.page_number = 1
- return chunks
-
- dummy_text_chunker.assign_page_number_to_chunks = dummy_assign_page
-
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "5"
- chunks = result["data"]["chunks"]
- assert isinstance(chunks, list)
- assert len(chunks) == 1
- assert chunks[0]["page_number"] == 1
-
-
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_missing_data():
- """
- Test that if the record is missing the 'data' key, the function returns an error.
- """
- record = {"recordId": "6"}
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[DummyChunkHolder("chunk")])
- dummy_text_chunker.assign_page_number_to_chunks = MagicMock()
-
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "6"
- assert result["data"] is None
- assert "errors" in result
-
-
@pytest.mark.asyncio
async def test_process_semantic_text_chunker_empty_content():
"""
@@ -244,7 +204,7 @@ def __init__(self, text):
class DummyNLP:
- def __call__(self, text):
+ def __call__(self, text, disable):
return DummyDoc(text)
@@ -253,7 +213,6 @@ def __call__(self, text):
def chunker():
# Use relaxed thresholds so that even short sentences qualify.
stc = SemanticTextChunker(
- num_surrounding_sentences=1,
similarity_threshold=0.8,
max_chunk_tokens=1000,
min_chunk_tokens=1,
@@ -267,43 +226,6 @@ def chunker():
return stc
-# --- Chunk Splitting Tests Using Real (Patched) Chunker ---
-
-
-@pytest.mark.asyncio
-async def test_chunk_complete_figure(chunker):
- """
- Test a text containing a complete <figure> element.
- Expect that the sentence with the complete figure is detected and grouped.
- """
- text = "Text before. Figure content. Text after."
- chunks = await chunker.chunk(text)
- # For our dummy segmentation, we expect two final chunks:
- # one that combines "Text before" and the figure, and one for "Text after".
- assert len(chunks) == 2
- # Check that the first chunk contains a complete figure.
- assert "
" in chunks[0].mark_up
-
-
-@pytest.mark.asyncio
-async def test_chunk_incomplete_figure(chunker):
- """
- Test a text with an incomplete figure element spanning multiple sentences.
- The start and end of the figure should be grouped together.
- """
- text = (
- "Text before. Start of figure. Figure continues . Text after."
- )
- chunks = await chunker.chunk(text)
- # Expected grouping: one chunk combining the normal text and the grouped figure,
- # and another chunk for text after.
- assert len(chunks) == 2
- # Check that the grouped chunk contains both the start and the end of the figure.
- assert "
" in chunks[0].mark_up
-
-
@pytest.mark.asyncio
async def test_chunk_markdown_heading(chunker):
"""
@@ -338,7 +260,6 @@ async def test_chunk_long_sentence():
"""
# Create a chunker that forces a long sentence to exceed the max token threshold.
stc = SemanticTextChunker(
- num_surrounding_sentences=1,
similarity_threshold=0.8,
max_chunk_tokens=5, # set low so even a few words exceed it
min_chunk_tokens=1,
@@ -353,3 +274,197 @@ async def test_chunk_long_sentence():
# And because 12 >= 5, that sentence is immediately appended as a chunk.
assert len(chunks) == 1
assert "exceed" in chunks[0].mark_up
+
+
+def test_assign_page_number_with_html_comments():
+ """Test that HTML comments are properly stripped when assigning page numbers."""
+ chunker = SemanticTextChunker()
+
+ # Create a chunk with HTML comments
+ chunk = ChunkHolder(mark_up="<!-- PageNumber='3' --> First line\nSecond line")
+
+ # Create page tracking holders
+ page_tracking_holders = [
+ PageNumberTrackingHolder(page_content="First line\nSecond line", page_number=3),
+ ]
+
+ # Call the method being tested
+ result_chunks = chunker.assign_page_number_to_chunks([chunk], page_tracking_holders)
+
+ # Verify the page number was correctly assigned despite the HTML comment
+ assert result_chunks[0].page_number == 3
+
+
+@pytest.mark.asyncio
+async def test_clean_new_lines():
+ """Test the clean_new_lines method properly processes newlines."""
+ chunker = SemanticTextChunker()
+
+ # Test with various newline patterns
+ text = "
First line\nSecond line
\n\n
Next paragraph
"
+ result = chunker.clean_new_lines(text)
+
+ # Check that single newlines between tags are removed
+ assert "
First line Second line
" in result
+ # Check that multiple newlines are replaced with space + \n\n
+ assert " \n\n
" in result
+
+
+@pytest.mark.asyncio
+async def test_filter_empty_figures():
+ """Test the filter_empty_figures method removes empty figure tags."""
+ chunker = SemanticTextChunker()
+
+ # Test with empty and non-empty figures
+ text = "
Text
More text
Content"
+ result = chunker.filter_empty_figures(text)
+
+ # Check that empty figures are removed
+ assert "" not in result
+ # Check that non-empty figures remain
+ assert "Content" in result
+
+
+@pytest.mark.asyncio
+async def test_group_figures_and_tables():
+ """Test grouping of figures and tables into sentences."""
+ chunker = SemanticTextChunker()
+
+ sentences = ["Before table.", "
" in grouped
+ # Check table map is correct
+ assert is_table_map == [False, True, False]
+
+
+@pytest.mark.asyncio
+async def test_remove_figures():
+ """Test the remove_figures method."""
+ chunker = SemanticTextChunker()
+
+ text = 'Text before <figure FigureId="12">Figure content</figure> text after'
+ result = chunker.remove_figures(text)
+
+ assert "Text before text after" == result
+ assert "
+
Cell 1
Cell 2
+
+
+> Blockquote text"""
+
+ chunks = await chunker.chunk(text)
+
+ # Verify we have reasonable chunks
+ assert len(chunks) >= 1
+
+ # Check heading formatting
+ heading_chunks = [c for c in chunks if "# Heading 1" in c.mark_up]
+ assert len(heading_chunks) > 0
+ assert "# Heading" in heading_chunks[0].mark_up
+
+
+@pytest.mark.asyncio
+async def test_process_page_tracking_no_match():
+ """Test behavior when page_number_tracking_holders is provided but doesn't match any chunks."""
+ record = {
+ "recordId": "8",
+ "data": {
+ "content": "Unique content that won't match page tracking.",
+ "page_number_tracking_holders": [
+ {"page_content": "Something completely different", "page_number": 10}
+ ],
+ },
+ }
+
+ chunker = SemanticTextChunker()
+ result = await process_semantic_text_chunker(record, chunker)
+
+ # Should default to page 1 when no match is found
+ assert result["data"]["chunks"][0]["page_number"] == 1
+
+
+@pytest.mark.asyncio
+async def test_nested_html_structure():
+ """Test handling of nested HTML tags."""
+ chunker = SemanticTextChunker()
+
+ text = """
+
Paragraph with bold text and italic text
+
+
Header 1
Header 2
+
Value 1
Value 2
+
+
"""
+
+ chunks = await chunker.chunk(text)
+
+ # Verify we get at least one chunk
+ assert len(chunks) > 0
+ # Check that the table is kept intact in one chunk
+ table_chunks = [
+ c for c in chunks if "<table>" in c.mark_up