diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py index 8a872f8..4e00d14 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py @@ -283,7 +283,6 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill: def get_semantic_chunker_skill( self, - num_surrounding_sentences: int = 2, similarity_threshold: float = 0.8, max_chunk_tokens: int = 500, min_chunk_tokens: int = 150, @@ -294,7 +293,6 @@ def get_semantic_chunker_skill( ----- context (str): The context of the skill source (str): The source of the skill - num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1. similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8. max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 200. @@ -314,8 +312,8 @@ def get_semantic_chunker_skill( name="content", source="/document/layout_merged_content" ), InputFieldMappingEntry( - name="per_page_starting_sentences", - source="/document/per_page_starting_sentences", + name="page_number_tracking_holders", + source="/document/page_number_tracking_holders", ), ] @@ -333,7 +331,6 @@ def get_semantic_chunker_skill( degree_of_parallelism=degree_of_parallelism, http_method="POST", http_headers={ - "num_surrounding_sentences": num_surrounding_sentences, "similarity_threshold": similarity_threshold, "max_chunk_tokens": max_chunk_tokens, "min_chunk_tokens": min_chunk_tokens, @@ -385,8 +382,8 @@ def get_layout_analysis_skill( output = [ OutputFieldMappingEntry(name="layout", target_name="layout"), OutputFieldMappingEntry( - name="per_page_starting_sentences", - target_name="per_page_starting_sentences", + name="page_number_tracking_holders", + target_name="page_number_tracking_holders", ), ] diff --git a/image_processing/README.md b/image_processing/README.md index be740e4..10fdc74 100644 --- a/image_processing/README.md +++ b/image_processing/README.md @@ -98,7 +98,7 @@ This skill merges the layout output with the figure outputs to create a unified ### Semantic Chunker Custom Skill -You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*. +You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(similarity_threshold, max_chunk_tokens, min_chunk_tokens)*. 
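For example, a minimal local test might look like the sketch below. It is hedged: it assumes the Functions host is running on the default `localhost:7071` port and that the request follows the standard Azure AI Search custom skill `values`/`recordId`/`data` payload shape — adjust the URL and record content for your deployment.

```python
# Hedged sketch only: assumes a locally running Functions host on the default port
# and the standard AI Search custom skill "values" payload shape.
import requests

payload = {
    "values": [
        {
            "recordId": "0",
            "data": {"content": "# Heading\nSome text to chunk into semantic pieces."},
        }
    ]
}

# Chunking parameters are passed as HTTP headers (string values); the skill casts
# them to float/int server-side.
headers = {
    "similarity_threshold": "0.8",
    "max_chunk_tokens": "500",
    "min_chunk_tokens": "200",
}

response = requests.post(
    "http://localhost:7071/api/semantic_text_chunker",
    json=payload,
    headers=headers,
)
print(response.json())
```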
### MarkUp Cleaner Custom Skill diff --git a/image_processing/src/image_processing/function_app.py b/image_processing/src/image_processing/function_app.py index c918eff..cefcdc2 100644 --- a/image_processing/src/image_processing/function_app.py +++ b/image_processing/src/image_processing/function_app.py @@ -171,9 +171,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse: semantic_text_chunker_config = req.headers - num_surrounding_sentences = int( - semantic_text_chunker_config.get("num_surrounding_sentences", 1) - ) similarity_threshold = float( semantic_text_chunker_config.get("similarity_threshold", 0.8) ) @@ -192,7 +189,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse: record_tasks = [] semantic_text_chunker_processor = SemanticTextChunker( - num_surrounding_sentences=num_surrounding_sentences, similarity_threshold=similarity_threshold, max_chunk_tokens=max_chunk_tokens, min_chunk_tokens=min_chunk_tokens, diff --git a/image_processing/src/image_processing/layout_analysis.py b/image_processing/src/image_processing/layout_analysis.py index 5a1ef4f..64fc8b0 100644 --- a/image_processing/src/image_processing/layout_analysis.py +++ b/image_processing/src/image_processing/layout_analysis.py @@ -22,8 +22,9 @@ LayoutHolder, PageWiseContentHolder, NonPageWiseContentHolder, - PerPageStartingSentenceHolder, + PageNumberTrackingHolder, ) +import re class StorageAccountHelper: @@ -341,14 +342,14 @@ def create_page_wise_content(self) -> list[LayoutHolder]: return page_wise_contents - def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]: + def create_page_number_tracking_holder(self) -> list[PageNumberTrackingHolder]: """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number. Returns: -------- list: A list of the starting sentence of each page.""" - per_page_starting_sentences = [] + page_number_tracking_holders = [] for page in self.result.pages: page_content = self.result.content[ @@ -358,22 +359,38 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde # Remove any leading whitespace/newlines. cleaned_content = page_content.lstrip() - # If a newline appears before a period, split on newline; otherwise, on period. - if "\n" in cleaned_content: - first_line = cleaned_content.split("\n", 1)[0] - elif "." in cleaned_content: - first_line = cleaned_content.split(".", 1)[0] + # Strip the html comment but keep the content + html_comments_pattern = re.compile(r"", re.DOTALL) + cleaned_content = html_comments_pattern.sub("", cleaned_content) + + # Remove anything inside a figure tag + cleaned_content = re.sub( + "
<figure(.*?)</figure>
", + "", + cleaned_content, + flags=re.DOTALL | re.MULTILINE, + ) + logging.info(f"Page Number: {page.page_number}") + logging.info(f"Content for Page Detection: {page_content}") + logging.info(f"Cleaned Content for Page Detection: {cleaned_content}") + + if len(cleaned_content) == 0: + logging.error( + "No content found in the cleaned result for page %s.", + page.page_number, + ) + cleaned_content = None else: - first_line = cleaned_content + cleaned_content = cleaned_content.strip() - per_page_starting_sentences.append( - PerPageStartingSentenceHolder( + page_number_tracking_holders.append( + PageNumberTrackingHolder( page_number=page.page_number, - starting_sentence=first_line.strip(), + page_content=cleaned_content, ) ) - return per_page_starting_sentences + return page_number_tracking_holders async def get_document_intelligence_client(self) -> DocumentIntelligenceClient: """Get the Azure Document Intelligence client. @@ -522,11 +539,11 @@ async def analyse(self): if self.extract_figures: await self.process_figures_from_extracted_content(text_content) - per_page_starting_sentences = self.create_per_page_starting_sentence() + page_number_tracking_holders = self.create_page_number_tracking_holder() output_record = NonPageWiseContentHolder( layout=text_content, - per_page_starting_sentences=per_page_starting_sentences, + page_number_tracking_holders=page_number_tracking_holders, ) except Exception as e: diff --git a/image_processing/src/image_processing/layout_holders.py b/image_processing/src/image_processing/layout_holders.py index 8d1535f..9e03cff 100644 --- a/image_processing/src/image_processing/layout_holders.py +++ b/image_processing/src/image_processing/layout_holders.py @@ -47,18 +47,18 @@ class PageWiseContentHolder(BaseModel): page_wise_layout: list[LayoutHolder] -class PerPageStartingSentenceHolder(BaseModel): +class PageNumberTrackingHolder(BaseModel): """A class to hold the starting sentence of each page.""" page_number: int - starting_sentence: str + page_content: str | None class NonPageWiseContentHolder(BaseModel): """A class to hold the non-page-wise content extracted from the document.""" layout: LayoutHolder - per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field( + page_number_tracking_holders: list[PageNumberTrackingHolder] = Field( default_factory=list ) @@ -69,6 +69,5 @@ class ChunkHolder(BaseModel): mark_up: str sections: Optional[list[str]] = Field(default_factory=list) figures: Optional[list[FigureHolder]] = Field(default_factory=list) - starting_sentence: Optional[str] = None cleaned_text: Optional[str] = None page_number: Optional[int] = Field(default=None) diff --git a/image_processing/src/image_processing/semantic_text_chunker.py b/image_processing/src/image_processing/semantic_text_chunker.py index b97c667..cb340c9 100644 --- a/image_processing/src/image_processing/semantic_text_chunker.py +++ b/image_processing/src/image_processing/semantic_text_chunker.py @@ -7,18 +7,16 @@ import spacy import numpy as np from model2vec import StaticModel -from layout_holders import PerPageStartingSentenceHolder, ChunkHolder +from layout_holders import PageNumberTrackingHolder, ChunkHolder class SemanticTextChunker: def __init__( self, - num_surrounding_sentences: int = 2, similarity_threshold: float = 0.8, max_chunk_tokens: int = 500, - min_chunk_tokens: int = 150, + min_chunk_tokens: int = 200, ): - self.num_surrounding_sentences = num_surrounding_sentences self.similarity_threshold = similarity_threshold self.max_chunk_tokens = max_chunk_tokens 
self.min_chunk_tokens = min_chunk_tokens @@ -88,7 +86,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]: Returns: list(str): The list of chunks""" - logging.info(f"Chunking text: {text}") + logging.debug(f"Chunking text: {text}") sentences = self.split_into_sentences(text) @@ -111,7 +109,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]: f"""Number of Forward pass chunks: { len(forward_pass_chunks)}""" ) - logging.info(f"Forward pass chunks: {forward_pass_chunks}") + logging.debug(f"Forward pass chunks: {forward_pass_chunks}") backwards_pass_chunks, _ = self.merge_chunks( forward_pass_chunks, new_is_table_or_figure_map, forwards_direction=False @@ -123,7 +121,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]: f"""Number of Backaward pass chunks: { len(reversed_backwards_pass_chunks)}""" ) - logging.info(f"Backward pass chunks: {reversed_backwards_pass_chunks}") + logging.debug(f"Backward pass chunks: {reversed_backwards_pass_chunks}") cleaned_final_chunks = [] for chunk in reversed_backwards_pass_chunks: @@ -132,7 +130,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]: cleaned_final_chunks.append(ChunkHolder(mark_up=stripped_chunk)) logging.info(f"Number of final chunks: {len(cleaned_final_chunks)}") - logging.info(f"Chunks: {cleaned_final_chunks}") + logging.debug(f"Chunks: {cleaned_final_chunks}") if len(cleaned_final_chunks) == 0: raise ValueError("No chunks were generated") @@ -174,9 +172,12 @@ def split_into_sentences(self, text: str) -> list[str]: # Filter out empty
<figure>...</figure>
tags cleaned_text = self.filter_empty_figures(cleaned_text) - logging.info(f"Cleaned text: {cleaned_text}") + logging.debug(f"Cleaned text: {cleaned_text}") - doc = self._nlp_model(cleaned_text) + self._nlp_model.max_length = len(cleaned_text) + 100 + doc = self._nlp_model( + cleaned_text, disable=["ner", "tagger", "lemmatizer", "textcat"] + ) tag_split_sentences = [] # Pattern to match the closing and opening tag junctions with whitespace in between @@ -258,103 +259,98 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]): return grouped_sentences, is_table_or_figure_map - def look_ahead_and_behind_sentences( - self, total_sentences, is_table_or_figure_map, current_sentence_index - ): - is_table_or_figure_ahead = False - is_table_or_figure_behind = False - - distance_to_next_figure = self.num_surrounding_sentences - - if current_sentence_index < self.num_surrounding_sentences: - is_table_or_figure_behind = is_table_or_figure_map[0] - else: - is_table_or_figure_behind = is_table_or_figure_map[ - current_sentence_index - self.num_surrounding_sentences - ] - - surround_sentences_gap_to_test = self.num_surrounding_sentences - if current_sentence_index + self.num_surrounding_sentences >= total_sentences: - is_table_or_figure_ahead = is_table_or_figure_map[-1] - surround_sentences_gap_to_test = total_sentences - current_sentence_index - else: - is_table_or_figure_ahead = is_table_or_figure_map[ - current_sentence_index + self.num_surrounding_sentences - ] - - for ( - next_sentence_is_table_or_figure_index, - next_sentence_is_table_or_figure, - ) in enumerate( - is_table_or_figure_map[ - current_sentence_index : current_sentence_index - + surround_sentences_gap_to_test - ] - ): - if next_sentence_is_table_or_figure: - distance_to_next_figure = next_sentence_is_table_or_figure_index - - return ( - is_table_or_figure_ahead, - is_table_or_figure_behind, - min(surround_sentences_gap_to_test, distance_to_next_figure), + def remove_figures(self, text): + figure_tag_pattern = ( + r"(.*?)" ) + return re.sub(figure_tag_pattern, "", text).strip() def merge_similar_chunks(self, current_sentence, current_chunk, forwards_direction): new_chunk = None - def retrieve_current_chunk_up_to_n(n): + def retrieve_current_chunk_up_to_minus_n(n): if forwards_direction: return " ".join(current_chunk[:-n]) else: return " ".join(reversed(current_chunk[:-n])) - def retrieve_current_chunks_from_n(n): - if forwards_direction: - return " ".join(current_chunk[n:]) - else: - return " ".join(reversed(current_chunk[:-n])) - def retrive_current_chunk_at_n(n): if forwards_direction: return current_chunk[n] else: return current_chunk[n] + def retrieve_current_chunks_from_n(n): + if forwards_direction: + return " ".join(current_chunk[n:]) + else: + return " ".join(reversed(current_chunk[n:])) + def get_current_chunk_tokens(chunk_segments): + if isinstance(chunk_segments, str): + return self.num_tokens_from_string(chunk_segments) + return self.num_tokens_from_string(" ".join(chunk_segments)) + if len(current_chunk) == 1: + logging.debug("Chunk too small to compare") + return new_chunk, current_chunk + + if len(current_chunk) > 2: + would_be_end_of_old_chunk = retrieve_current_chunk_up_to_minus_n(1) + would_be_start_of_new_chunk = [retrive_current_chunk_at_n(-1)] + else: + would_be_end_of_old_chunk = retrive_current_chunk_at_n(0) + would_be_start_of_new_chunk = [retrive_current_chunk_at_n(1)] + current_chunk_tokens = get_current_chunk_tokens(current_chunk) + logging.debug(f"Current chunk tokens: 
{current_chunk_tokens}") + would_be_end_of_old_chunk_tokens = get_current_chunk_tokens( + would_be_end_of_old_chunk + ) + logging.debug(f"Would be new chunk tokens: {would_be_end_of_old_chunk_tokens}") - if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens: - # Calculate the tokens if we were to split - if len(current_chunk) > 2: - would_be_new_chunk = retrieve_current_chunk_up_to_n(1) - would_be_current_chunk = [retrive_current_chunk_at_n(-1)] - else: - would_be_new_chunk = retrive_current_chunk_at_n(0) - would_be_current_chunk = [retrive_current_chunk_at_n(1)] + would_be_end_of_old_chunk_without_figures = self.remove_figures( + would_be_end_of_old_chunk + ) - if ( - get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens - and get_current_chunk_tokens(would_be_current_chunk) - >= self.min_chunk_tokens - ): - logging.info("Comparing chunks") - if ( - current_chunk_tokens >= self.max_chunk_tokens - or self.sentence_similarity( - retrieve_current_chunks_from_n(-2), current_sentence - ) - < self.similarity_threshold - ): - return would_be_new_chunk, would_be_current_chunk - else: - logging.info("Chunk too small to compare") - else: - logging.info("Chunk too small to compare") + would_be_end_of_old_chunk_without_figures_tokens = self.num_tokens_from_string( + would_be_end_of_old_chunk_without_figures + ) + + would_be_start_of_new_chunk_without_figures = self.remove_figures( + " ".join(would_be_start_of_new_chunk) + ) + + if len(would_be_start_of_new_chunk_without_figures) == 0: + logging.debug("Chunk would only contain figures. Not comparing") + return new_chunk, current_chunk + + if ( + would_be_end_of_old_chunk_tokens < self.min_chunk_tokens + or would_be_end_of_old_chunk_without_figures_tokens + < (self.min_chunk_tokens / 2) + ): + logging.debug("Chunk too small. Not comparing") + return new_chunk, current_chunk + + if would_be_end_of_old_chunk_without_figures_tokens > self.max_chunk_tokens: + logging.debug("Chunk too large. 
Not comparing") + return would_be_end_of_old_chunk, would_be_start_of_new_chunk + + similarity_set = retrieve_current_chunks_from_n(-2) - return new_chunk, current_chunk + # Calculate the tokens if we were to split + logging.debug("Comparing chunks") + if ( + current_chunk_tokens > (self.max_chunk_tokens * 1.5) + or self.sentence_similarity(similarity_set, current_sentence) + < self.similarity_threshold + ): + return would_be_end_of_old_chunk, would_be_start_of_new_chunk + else: + logging.debug("Above similarity threshold") + return new_chunk, current_chunk def is_markdown_heading(self, text): return text.strip().startswith("#") @@ -385,96 +381,73 @@ def retrieve_current_chunk(): index += 1 continue - # Detect if table or figure - if is_table_or_figure_map[current_sentence_index]: - if forwards_direction: + if forwards_direction and self.is_markdown_heading(current_sentence): + heading_level = current_sentence.count("#") + + if heading_level in [1, 2]: + # Start new chunk if len(current_chunk) > 0: - current_chunk.append(current_sentence) - chunks.append(retrieve_current_chunk()) - new_is_table_or_figure_map.append(True) - current_chunk = [] - else: - current_chunk.append(current_sentence) - else: - # On the backwards pass we don't want to add to the table chunk - chunks.append(retrieve_current_chunk()) - new_is_table_or_figure_map.append(True) - current_chunk = [current_sentence] + current_chunk = retrieve_current_chunk() + chunks.append(current_chunk) + new_is_table_or_figure_map.append( + self.sentence_contains_figure_or_table(current_chunk) + ) + current_chunk = [current_sentence] - index += 1 - continue - elif forwards_direction: - # Look ahead to see if figure of table is coming up - # We only do this on the forward pass - ( - is_table_or_figure_ahead, - is_table_or_figure_behind, - min_of_distance_to_next_figure_or_num_surrounding_sentences, - ) = self.look_ahead_and_behind_sentences( - total_sentences, is_table_or_figure_map, current_sentence_index - ) + index += 1 + continue - if is_table_or_figure_behind: - # Check if Makrdown heading - if self.is_markdown_heading(current_sentence): - # Start new chunk - chunks.append(retrieve_current_chunk()) - new_is_table_or_figure_map.append(False) - current_chunk = [current_sentence] - else: - # Finish off - current_chunk.append(current_sentence) - chunks.append(retrieve_current_chunk()) - new_is_table_or_figure_map.append(False) - current_chunk = [] + # Detect if table or figure + if forwards_direction and is_table_or_figure_map[current_sentence_index]: + if len(current_chunk) > 0: + current_chunk.append(current_sentence) + chunks.append(retrieve_current_chunk()) + new_is_table_or_figure_map.append(True) + current_chunk = [] index += 1 continue - elif is_table_or_figure_ahead: - # Add to the ahead chunk - chunks.append(retrieve_current_chunk()) - new_is_table_or_figure_map.append(False) - if forwards_direction: - current_chunk = sentences[ - current_sentence_index : current_sentence_index - + min_of_distance_to_next_figure_or_num_surrounding_sentences - ] - else: - current_chunk = sentences[ - current_sentence_index : current_sentence_index - - min_of_distance_to_next_figure_or_num_surrounding_sentences : -1 - ] - index += min_of_distance_to_next_figure_or_num_surrounding_sentences - continue # now group semanticly - num_tokens = self.num_tokens_from_string(current_sentence) + current_chunk.append(current_sentence) - if num_tokens >= self.max_chunk_tokens: - chunks.append(current_sentence) - new_is_table_or_figure_map.append(False) - else: 
- current_chunk.append(current_sentence) + new_chunk, current_chunk = self.merge_similar_chunks( + current_sentence, + current_chunk, + forwards_direction=forwards_direction, + ) - new_chunk, current_chunk = self.merge_similar_chunks( - current_sentence, - current_chunk, - forwards_direction=forwards_direction, + if new_chunk is not None: + chunks.append(new_chunk) + new_is_table_or_figure_map.append( + self.sentence_contains_figure_or_table(new_chunk) ) - if new_chunk is not None: - chunks.append(new_chunk) - new_is_table_or_figure_map.append(False) - index += 1 if len(current_chunk) > 0: final_chunk = retrieve_current_chunk() - chunks.append(final_chunk) - new_is_table_or_figure_map.append( - self.sentence_contains_figure_or_table(final_chunk) - ) + # Get tokens of this chunk + if ( + self.num_tokens_from_string(final_chunk) < self.min_chunk_tokens + and len(chunks) > 0 + ): + # Add the last chunk to the new chunks + if forwards_direction: + final_chunk = chunks[-1] + " " + final_chunk + else: + final_chunk = final_chunk + " " + chunks[-1] + + chunks[-1] = final_chunk + new_is_table_or_figure_map[-1] = self.sentence_contains_figure_or_table( + final_chunk + ) + else: + chunks.append(final_chunk) + new_is_table_or_figure_map.append( + self.sentence_contains_figure_or_table(final_chunk) + ) return chunks, new_is_table_or_figure_map @@ -486,7 +459,7 @@ def sentence_similarity(self, text_1, text_2): magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2) similarity = dot_product / magnitude if magnitude != 0 else 0.0 - logging.info( + logging.debug( f"""Similarity between '{text_1}' and '{ text_2}': {similarity}""" ) @@ -495,28 +468,83 @@ def sentence_similarity(self, text_1, text_2): def assign_page_number_to_chunks( self, chunks: list[ChunkHolder], - per_page_starting_sentences: list[PerPageStartingSentenceHolder], + page_number_tracking_holders: list[PageNumberTrackingHolder], ) -> list[ChunkHolder]: """Assigns page numbers to the chunks based on the starting sentences of each page. Args: chunks (list[ChunkHolder]): The list of chunks. - per_page_starting_sentences (list[PerPageStartingSentenceHolder]): The list of starting sentences of each page. + page_number_tracking_holders (list[PageNumberTrackingHolder]): The list of starting sentences of each page. Returns: list[ChunkHolder]: The list of chunks with page numbers assigned.""" page_number = 1 for chunk in chunks: - for per_page_starting_sentence in per_page_starting_sentences[ - page_number - 1 : - ]: - if per_page_starting_sentence.starting_sentence in chunk: - logging.info( - "Assigning page number %i to chunk", - per_page_starting_sentence.page_number, + # Remove any leading whitespace/newlines. 
+ cleaned_content = chunk.mark_up.lstrip() + # Strip the html comment but keep the content + html_comments_pattern = re.compile(r"", re.DOTALL) + cleaned_content = html_comments_pattern.sub("", cleaned_content) + + # Use the nlp model to get the first sentence + sentences = list( + self._nlp_model( + cleaned_content, disable=["ner", "tagger", "lemmatizer", "textcat"] + ).sents + ) + + if len(sentences) == 0: + first_line = None + else: + first_sentence = sentences[0].text.strip() + + if "#" in first_sentence: + logging.info("Splitting on hash") + # Delibretely split on the next hash to get the first line of the markdown content + first_line = ( + first_sentence.split(" #", 1)[0] + .strip() + .split("\n", 1)[0] + .strip() ) - page_number = per_page_starting_sentence.page_number - break + elif "" in first_sentence: + logging.info("Joining onto second sentence to form first row") + if len(sentences) > 1: + first_line = ( + first_sentence.lstrip() + "\n" + sentences[1].text.strip() + ) + else: + first_line = first_sentence + elif "\n" in first_sentence: + logging.info("Splitting on newline") + first_line = first_sentence.split("\n", 1)[0].strip() + elif "." in first_sentence: + logging.info("Splitting on period") + first_line = first_sentence.split(".", 1)[0].strip() + else: + logging.info("No split found") + first_line = first_sentence.strip() + + if first_line is not None: + logging.info(f"Looking for First line: {first_line}") + for page_number_tracking_holder in page_number_tracking_holders[ + page_number - 1 : + ]: + if page_number_tracking_holder.page_content is not None: + if ( + first_line == page_number_tracking_holder.page_content + or first_line in page_number_tracking_holder.page_content + or first_line + in page_number_tracking_holder.page_content.replace( + "\n", " " + ) + ): + logging.info( + "Assigning page number %i to chunk", + page_number, + ) + page_number = page_number_tracking_holder.page_number + break chunk.page_number = page_number return chunks @@ -545,16 +573,16 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict: # scenarios when page by chunking is enabled chunks = await text_chunker.chunk(record["data"]["content"]) - if "per_page_starting_sentences" in record["data"]: - per_page_starting_sentences = [ - PerPageStartingSentenceHolder(**sentence) - for sentence in record["data"]["per_page_starting_sentences"] + if "page_number_tracking_holders" in record["data"]: + page_number_tracking_holders = [ + PageNumberTrackingHolder(**sentence) + for sentence in record["data"]["page_number_tracking_holders"] ] - logging.info(f"Per page starting sentences: {per_page_starting_sentences}") + logging.info(f"Per page holders: {page_number_tracking_holders}") chunks = text_chunker.assign_page_number_to_chunks( - chunks, per_page_starting_sentences + chunks, page_number_tracking_holders ) cleaned_record["data"]["chunks"] = [ diff --git a/image_processing/tests/image_processing/test_figure_app.py b/image_processing/tests/image_processing/test_figure_app.py index e86a1f4..dcdd1b9 100644 --- a/image_processing/tests/image_processing/test_figure_app.py +++ b/image_processing/tests/image_processing/test_figure_app.py @@ -188,7 +188,6 @@ async def dummy_process_semantic_text_chunker(value, processor): ) headers = { - "num_surrounding_sentences": "2", "similarity_threshold": "0.9", "max_chunk_tokens": "600", "min_chunk_tokens": "60", diff --git a/image_processing/tests/image_processing/test_layout_analysis.py 
b/image_processing/tests/image_processing/test_layout_analysis.py index e9de95a..5c4d642 100644 --- a/image_processing/tests/image_processing/test_layout_analysis.py +++ b/image_processing/tests/image_processing/test_layout_analysis.py @@ -11,6 +11,8 @@ LayoutAnalysis, ) +from layout_holders import LayoutHolder + # --- Dummy classes to simulate ADI results and figures --- class DummySpan: @@ -436,7 +438,7 @@ class DummyResultContent: assert layout.page_offsets == 0 -def test_create_per_page_starting_sentence(): +def test_create_page_number_tracking_holder(): # Create a LayoutAnalysis instance. la = LayoutAnalysis(record_id=200, source="dummy") @@ -449,17 +451,17 @@ class DummyResultContent: dummy_result = DummyResultContent() dummy_result.content = "HelloWorld. This is a test sentence." # DummyPage creates a page with spans as a list of dictionaries. - dummy_result.pages = [DummyPage(0, 10, 1)] + dummy_result.pages = [DummyPage(0, 36, 1)] la.result = dummy_result - sentences = la.create_per_page_starting_sentence() - assert len(sentences) == 1 - sentence = sentences[0] - assert sentence.page_number == 1 - assert sentence.starting_sentence == "HelloWorld" + page_number_trackers = la.create_page_number_tracking_holder() + assert len(page_number_trackers) == 1 + tracker = page_number_trackers[0] + assert tracker.page_number == 1 + assert tracker.page_content == "HelloWorld. This is a test sentence." -def test_create_per_page_starting_sentence_multiple_pages(): +def test_create_page_number_tracking_holder_multiple_pages(): # Create a LayoutAnalysis instance. la = LayoutAnalysis(record_id=300, source="dummy") @@ -479,15 +481,337 @@ class DummyResultContent: ] la.result = dummy_result - # Call create_per_page_starting_sentence and check results. - sentences = la.create_per_page_starting_sentence() - assert len(sentences) == 2 + # Call create_page_number_tracking_holder and check results. + page_number_trackers = la.create_page_number_tracking_holder() + assert len(page_number_trackers) == 2 # For page 1, the substring is "Page one." -> split on "." gives "Page one" - assert sentences[0].page_number == 1 - assert sentences[0].starting_sentence == "Page one" + assert page_number_trackers[0].page_number == 1 + assert page_number_trackers[0].page_content == "Page one." # For page 2, the substring is "Page two text and" -> split on "." gives the entire string - assert sentences[1].page_number == 2 + assert page_number_trackers[1].page_number == 2 # We strip potential leading/trailing spaces for validation. - assert sentences[1].starting_sentence.strip() == "Page two text and more content" + assert ( + page_number_trackers[1].page_content.strip() + == "Page two text and more content. This is more random content that is on page 2." 
+ ) + + +# Test for download_figure_image with retry logic +@pytest.mark.asyncio +async def test_download_figure_image_with_retry(monkeypatch): + """Test the download_figure_image method with retry logic.""" + la = LayoutAnalysis(record_id=101, source="dummy") + la.operation_id = "op101" + la.result = DummyResult("content", [], [], model_id="model101") + + # Create a counter to track number of attempts + call_count = 0 + + # Mock document_intelligence_client.get_analyze_result_figure + class MockResponse: + def __init__(self): + self.chunks = [b"chunk1", b"chunk2"] + + def __aiter__(self): + return self + + async def __anext__(self): + if not self.chunks: + raise StopAsyncIteration + return self.chunks.pop(0) + + class MockClient: + async def __aenter__(self): + return self + + async def __aexit__(self, *args): + pass + + async def get_analyze_result_figure(self, model_id, result_id, figure_id): + nonlocal call_count + call_count += 1 + if call_count == 1: + # Fail on first attempt + raise Exception("Temporary failure") + # Succeed on subsequent attempts + return MockResponse() + + # Patch get_document_intelligence_client to return our mock + async def mock_get_client(): + return MockClient() + + monkeypatch.setattr(la, "get_document_intelligence_client", mock_get_client) + + # Call the method - should succeed after retry + result = await la.download_figure_image("fig1") + + # Check that it was called more than once (at least one retry) + assert call_count > 1 + # Check the result contains both chunks + assert result == b"chunk1chunk2" + + +# Test for non-page-wise analysis with figures +@pytest.mark.asyncio +async def test_analyse_non_page_wise_with_figures(monkeypatch, dummy_storage_helper): + """Test non-page-wise analysis with figures.""" + source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt" + la = LayoutAnalysis( + page_wise=False, extract_figures=True, record_id=102, source=source + ) + la.extract_file_info() + + monkeypatch.setattr( + la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper) + ) + monkeypatch.setattr( + dummy_storage_helper, + "download_blob_to_temp_dir", + AsyncMock(return_value=("/tmp/dummy.txt", {})), + ) + + # Create a dummy result with content and a figure + dummy_page = DummyPage(0, 20, 1) + dummy_figure = DummyFigure( + "fig102", + offset=10, + length=5, + page_number=1, + caption_content="Figure 102 caption", + ) + dummy_result = DummyResult( + content="Full document content", pages=[dummy_page], figures=[dummy_figure] + ) + + async def dummy_analyse_document(file_path): + la.result = dummy_result + la.operation_id = "op102" + + monkeypatch.setattr(la, "analyse_document", dummy_analyse_document) + + # Mock figure download and upload + monkeypatch.setattr( + la, "download_figure_image", AsyncMock(return_value=b"figure102_image_data") + ) + monkeypatch.setattr( + dummy_storage_helper, + "upload_blob", + AsyncMock(return_value="http://dummy.url/fig102.png"), + ) + + result = await la.analyse() + + assert result["recordId"] == 102 + assert result["data"] is not None + # In non-page-wise mode, we should have layout and page_number_tracking_holders + assert "layout" in result["data"] + assert "page_number_tracking_holders" in result["data"] + + # Verify figure was processed + layout = result["data"]["layout"] + assert "figures" in layout + figures = layout["figures"] + assert len(figures) == 1 + assert figures[0]["figure_id"] == "fig102" + assert figures[0]["caption"] == "Figure 102 caption" + expected_b64 = 
base64.b64encode(b"figure102_image_data").decode("utf-8") + assert figures[0]["data"] == expected_b64 + + +# Test for when extract_figures is False +@pytest.mark.asyncio +async def test_analyse_without_extracting_figures(monkeypatch, dummy_storage_helper): + """Test analysis when extract_figures is False.""" + source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt" + la = LayoutAnalysis( + page_wise=True, extract_figures=False, record_id=103, source=source + ) + la.extract_file_info() + + monkeypatch.setattr( + la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper) + ) + monkeypatch.setattr( + dummy_storage_helper, + "download_blob_to_temp_dir", + AsyncMock(return_value=("/tmp/dummy.txt", {})), + ) + + # Create a dummy result with content and a figure + dummy_page = DummyPage(0, 10, 1) + dummy_figure = DummyFigure( + "fig103", + offset=5, + length=3, + page_number=1, + caption_content="Figure 103 caption", + ) + dummy_result = DummyResult( + content="Page content", pages=[dummy_page], figures=[dummy_figure] + ) + + async def dummy_analyse_document(file_path): + la.result = dummy_result + la.operation_id = "op103" + + monkeypatch.setattr(la, "analyse_document", dummy_analyse_document) + + # Add spy on process_figures_from_extracted_content to ensure it's not called + process_figures_spy = AsyncMock() + monkeypatch.setattr( + la, "process_figures_from_extracted_content", process_figures_spy + ) + + result = await la.analyse() + + # Verify the function was not called + process_figures_spy.assert_not_called() + + assert result["recordId"] == 103 + assert result["data"] is not None + # Verify we have page_wise_layout + assert "page_wise_layout" in result["data"] + layouts = result["data"]["page_wise_layout"] + assert len(layouts) == 1 + # Each layout should have an empty figures list + assert layouts[0]["figures"] == [] + + +# Test for HTML comment handling in create_page_number_tracking_holder +def test_create_page_number_tracking_holder_html_comments(): + """Test HTML comment handling in page content extraction.""" + la = LayoutAnalysis(record_id=104, source="dummy") + + class DummyResultContent: + pass + + dummy_result = DummyResultContent() + # Content with HTML comments + dummy_result.content = "Before After" + dummy_result.pages = [DummyPage(0, 29, 1)] # Full content + la.result = dummy_result + + page_number_trackers = la.create_page_number_tracking_holder() + assert len(page_number_trackers) == 1 + # HTML comments should be removed + assert page_number_trackers[0].page_content == "Before After" + + +# Test for figure tag handling in create_page_number_tracking_holder +def test_create_page_number_tracking_holder_figure_tags(): + """Test figure tag handling in page content extraction.""" + la = LayoutAnalysis(record_id=105, source="dummy") + + class DummyResultContent: + pass + + dummy_result = DummyResultContent() + # Content with figure tags + dummy_result.content = "Before
<figure>Figure content</figure>
After" + dummy_result.pages = [DummyPage(0, 44, 1)] # Full content + la.result = dummy_result + + page_number_trackers = la.create_page_number_tracking_holder() + assert len(page_number_trackers) == 1 + # Figure content should be removed + assert page_number_trackers[0].page_content == "Before After" + + +# Test handling of empty content +def test_create_page_number_tracking_holder_empty_content(): + """Test handling of empty content in page tracking.""" + la = LayoutAnalysis(record_id=106, source="dummy") + + class DummyResultContent: + pass + + dummy_result = DummyResultContent() + # Empty content + dummy_result.content = "" + dummy_result.pages = [DummyPage(0, 0, 1)] # Empty content + la.result = dummy_result + + page_number_trackers = la.create_page_number_tracking_holder() + assert len(page_number_trackers) == 1 + # Page content should be None for empty content + assert page_number_trackers[0].page_content is None + + +# Test for process_layout_analysis with page_wise=True +@pytest.mark.asyncio +async def test_process_layout_analysis_page_wise(monkeypatch): + """Test process_layout_analysis with page_wise=True.""" + record = { + "recordId": "107", + "data": {"source": "https://dummy.blob.core.windows.net/container/blob.pdf"}, + } + + # Create a mock LayoutAnalysis + mock_layout_analysis = AsyncMock() + mock_layout_analysis.analyse = AsyncMock( + return_value={"recordId": "107", "data": {"result": "success"}} + ) + + # Mock the LayoutAnalysis constructor + def mock_layout_analysis_constructor(*args, **kwargs): + # Verify page_wise=True was passed + assert kwargs["page_wise"] is True + return mock_layout_analysis + + monkeypatch.setattr( + "layout_analysis.LayoutAnalysis", mock_layout_analysis_constructor + ) + + result = await process_layout_analysis(record, page_wise=True) + + # Verify analyse was called + mock_layout_analysis.analyse.assert_called_once() + assert result["recordId"] == "107" + assert result["data"] == {"result": "success"} + + +# Test handling figures without captions +@pytest.mark.asyncio +async def test_figure_without_caption(monkeypatch, dummy_storage_helper): + """Test handling figures without captions.""" + source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt" + la = LayoutAnalysis( + page_wise=False, extract_figures=True, record_id=108, source=source + ) + la.extract_file_info() + + monkeypatch.setattr( + la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper) + ) + monkeypatch.setattr( + dummy_storage_helper, + "download_blob_to_temp_dir", + AsyncMock(return_value=("/tmp/dummy.txt", {})), + ) + + # Create a figure without a caption (caption=None) + dummy_figure = DummyFigure( + "fig108", offset=5, length=3, page_number=1, caption_content=None + ) + dummy_result = DummyResult( + content="Content", pages=[DummyPage(0, 7, 1)], figures=[dummy_figure] + ) + + la.result = dummy_result + monkeypatch.setattr( + la, "download_figure_image", AsyncMock(return_value=b"figure108_image_data") + ) + + # Create a minimal layout holder for testing + layout_holder = LayoutHolder(content="Test", page_number=1, page_offsets=0) + + # Process the figures + await la.process_figures_from_extracted_content(layout_holder) + + # Check that the figure was processed despite having no caption + assert len(layout_holder.figures) == 1 + figure = layout_holder.figures[0] + assert figure.figure_id == "fig108" + assert figure.caption is None # Caption should be None diff --git a/image_processing/tests/image_processing/test_layout_holders.py 
b/image_processing/tests/image_processing/test_layout_holders.py index 3d2d1c4..4e23893 100644 --- a/image_processing/tests/image_processing/test_layout_holders.py +++ b/image_processing/tests/image_processing/test_layout_holders.py @@ -8,7 +8,7 @@ PageWiseContentHolder, NonPageWiseContentHolder, ChunkHolder, - PerPageStartingSentenceHolder, + PageNumberTrackingHolder, ) @@ -74,34 +74,32 @@ def test_chunk_holder_creation(): mark_up="Sample markup", sections=["Section1", "Section2"], figures=[], - starting_sentence="First sentence", cleaned_text="Cleaned text content", page_number=1, ) assert chunk.mark_up == "Sample markup" assert chunk.sections == ["Section1", "Section2"] - assert chunk.starting_sentence == "First sentence" assert chunk.cleaned_text == "Cleaned text content" assert chunk.page_number == 1 -def test_per_page_starting_sentence_holder_creation(): - sentence = PerPageStartingSentenceHolder( - page_number=1, starting_sentence="This is the starting sentence." +def test_per_page_page_content_holder_creation(): + sentence = PageNumberTrackingHolder( + page_number=1, page_content="This is the full content." ) assert sentence.page_number == 1 - assert sentence.starting_sentence == "This is the starting sentence." + assert sentence.page_content == "This is the full content." -def test_non_page_wise_content_holder_with_sentences(): +def test_non_page_wise_content_holder_with_page_number_trackers(): layout = LayoutHolder(content="Full document") - sentences = [ - PerPageStartingSentenceHolder(page_number=1, starting_sentence="Start 1"), - PerPageStartingSentenceHolder(page_number=2, starting_sentence="Start 2"), + page_number_trackers = [ + PageNumberTrackingHolder(page_number=1, page_content="Start 1"), + PageNumberTrackingHolder(page_number=2, page_content="Start 2"), ] non_page_holder = NonPageWiseContentHolder( - layout=layout, per_page_starting_sentences=sentences + layout=layout, page_number_tracking_holders=page_number_trackers ) assert non_page_holder.layout.content == "Full document" - assert len(non_page_holder.per_page_starting_sentences) == 2 - assert non_page_holder.per_page_starting_sentences[0].starting_sentence == "Start 1" + assert len(non_page_holder.page_number_tracking_holders) == 2 + assert non_page_holder.page_number_tracking_holders[0].page_content == "Start 1" diff --git a/image_processing/tests/image_processing/test_semantic_text_chunker.py b/image_processing/tests/image_processing/test_semantic_text_chunker.py index 59e8364..07277c4 100644 --- a/image_processing/tests/image_processing/test_semantic_text_chunker.py +++ b/image_processing/tests/image_processing/test_semantic_text_chunker.py @@ -1,3 +1,5 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
import pytest from unittest.mock import AsyncMock, MagicMock @@ -6,6 +8,8 @@ SemanticTextChunker, ) +from layout_holders import ChunkHolder, PageNumberTrackingHolder + # --- Dummy Classes for Process-Level Tests --- @@ -18,75 +22,80 @@ def model_dump(self, by_alias=False): return {"mark_up": self.mark_up, "page_number": self.page_number} -class DummyPerPageStartingSentenceHolder: - def __init__(self, starting_sentence, page_number): - self.starting_sentence = starting_sentence +class DummyPageNumberTrackingHolder: + def __init__(self, page_content, page_number): + self.page_content = page_content self.page_number = page_number # --- Process-Level Tests (Using Dummy Chunker) --- -@pytest.mark.asyncio -async def test_process_semantic_text_chunker_success_without_page(): - """Test a successful chunking when no per-page starting sentences are provided.""" - record = {"recordId": "1", "data": {"content": "Some content to be chunked."}} +@pytest.mark.parametrize( + "chunk_contents, page_content, expected_page", + [ + # Test matching on markdown heading + (["# Title", "Content"], "# Title", 2), + # Test matching on newline content + (["First line", "Second line"], "First line", 3), + # Test matching on period + (["First sentence. Second sentence"], "First sentence. Second sentence", 4), + # Test matching on table + (["
<table>Table content</table>
"], "", 1), + # Test no match (should get default page 1) + (["Content not in any page_content"], "Different content", 1), + ], +) +def test_assign_page_number_to_chunks(chunk_contents, page_content, expected_page): + """Test the page assignment logic for different types of content.""" + # Create a real SemanticTextChunker instance + chunker = SemanticTextChunker() + + chunks = [ChunkHolder(mark_up=chunk_content) for chunk_content in chunk_contents] + + # Create chunks with different content types + + # Create page tracking holders + page_tracking_holders = [ + PageNumberTrackingHolder(page_content="", page_number=1), + PageNumberTrackingHolder(page_content="# Title", page_number=2), + PageNumberTrackingHolder(page_content="First line", page_number=3), + PageNumberTrackingHolder(page_content="First sentence", page_number=4), + PageNumberTrackingHolder(page_content="Different content", page_number=5), + ] - dummy_chunk = DummyChunkHolder("chunk1") - dummy_text_chunker = MagicMock() - dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk]) - dummy_text_chunker.assign_page_number_to_chunks = MagicMock() + # Call the method being tested + result_chunks = chunker.assign_page_number_to_chunks(chunks, page_tracking_holders) - result = await process_semantic_text_chunker(record, dummy_text_chunker) - assert result["recordId"] == "1" - assert result["data"] is not None - chunks = result["data"]["chunks"] - assert isinstance(chunks, list) - assert len(chunks) == 1 - assert chunks[0]["mark_up"] == "chunk1" - # When no page info is provided, page_number remains unchanged (None in our dummy). - assert chunks[0]["page_number"] is None + # Verify the page number was correctly assigned + assert result_chunks[0].page_number == expected_page -@pytest.mark.asyncio -async def test_process_semantic_text_chunker_success_with_page(): - """Test a successful chunking when per-page starting sentences are provided and match a chunk.""" - record = { - "recordId": "2", - "data": { - "content": "Some content to be chunked.", - "per_page_starting_sentences": [ - {"starting_sentence": "chunk", "page_number": 5} - ], - }, - } +def test_assign_page_number_to_chunks_multiple_chunks(): + """Test assigning page numbers to multiple chunks.""" + chunker = SemanticTextChunker() - dummy_chunk = DummyChunkHolder("This dummy chunk contains chunk in its text") - dummy_text_chunker = MagicMock() - dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk]) + # Create multiple chunks + chunks = [ + ChunkHolder(mark_up="# Introduction\nThis is the first section."), + ChunkHolder(mark_up="# Methods\nThis describes the methods used."), + ChunkHolder(mark_up="# Results\nThese are the results."), + ] - def dummy_assign_page(chunks, per_page_starting_sentences): - ps_objs = [ - DummyPerPageStartingSentenceHolder(**ps.__dict__) - for ps in per_page_starting_sentences - ] - page_number = 1 - for chunk in chunks: - for ps in ps_objs: - if ps.starting_sentence in chunk.mark_up: - page_number = ps.page_number - break - chunk.page_number = page_number - return chunks + # Create page tracking holders for different sections + page_tracking_holders = [ + PageNumberTrackingHolder(page_content="# Introduction", page_number=1), + PageNumberTrackingHolder(page_content="# Methods", page_number=3), + PageNumberTrackingHolder(page_content="# Results", page_number=5), + ] - dummy_text_chunker.assign_page_number_to_chunks = dummy_assign_page + # Call the method being tested + result_chunks = chunker.assign_page_number_to_chunks(chunks, 
page_tracking_holders) - result = await process_semantic_text_chunker(record, dummy_text_chunker) - assert result["recordId"] == "2" - chunks = result["data"]["chunks"] - assert isinstance(chunks, list) - assert len(chunks) == 1 - assert chunks[0]["page_number"] == 5 + # Verify page numbers were correctly assigned + assert result_chunks[0].page_number == 1 + assert result_chunks[1].page_number == 3 + assert result_chunks[2].page_number == 5 @pytest.mark.asyncio @@ -119,9 +128,9 @@ async def test_process_semantic_text_chunker_multiple_chunks(): "recordId": "4", "data": { "content": "Content that generates multiple chunks.", - "per_page_starting_sentences": [ - {"starting_sentence": "first_page", "page_number": 3}, - {"starting_sentence": "second_page", "page_number": 4}, + "page_number_tracking_holders": [ + {"page_content": "first_page", "page_number": 3}, + {"page_content": "second_page", "page_number": 4}, ], }, } @@ -131,15 +140,15 @@ async def test_process_semantic_text_chunker_multiple_chunks(): dummy_text_chunker = MagicMock() dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk1, dummy_chunk2]) - def dummy_assign_page(chunks, per_page_starting_sentences): + def dummy_assign_page(chunks, page_number_tracking_holders): ps_objs = [ - DummyPerPageStartingSentenceHolder(**ps.__dict__) - for ps in per_page_starting_sentences + DummyPageNumberTrackingHolder(**ps.__dict__) + for ps in page_number_tracking_holders ] page_number = 1 for chunk in chunks: for ps in ps_objs: - if ps.starting_sentence in chunk.mark_up: + if ps.page_content in chunk.mark_up: page_number = ps.page_number break chunk.page_number = page_number @@ -156,55 +165,6 @@ def dummy_assign_page(chunks, per_page_starting_sentences): assert chunks[1]["page_number"] == 4 -@pytest.mark.asyncio -async def test_process_semantic_text_chunker_empty_page_sentences(): - """ - Test a record where 'per_page_starting_sentences' exists but is empty. - In this case, the default page (1) is assigned. - """ - record = { - "recordId": "5", - "data": { - "content": "Some content to be chunked.", - "per_page_starting_sentences": [], - }, - } - - dummy_chunk = DummyChunkHolder("Chunk without any page indicator") - dummy_text_chunker = MagicMock() - dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk]) - - def dummy_assign_page(chunks, per_page_starting_sentences): - for chunk in chunks: - chunk.page_number = 1 - return chunks - - dummy_text_chunker.assign_page_number_to_chunks = dummy_assign_page - - result = await process_semantic_text_chunker(record, dummy_text_chunker) - assert result["recordId"] == "5" - chunks = result["data"]["chunks"] - assert isinstance(chunks, list) - assert len(chunks) == 1 - assert chunks[0]["page_number"] == 1 - - -@pytest.mark.asyncio -async def test_process_semantic_text_chunker_missing_data(): - """ - Test that if the record is missing the 'data' key, the function returns an error. 
- """ - record = {"recordId": "6"} - dummy_text_chunker = MagicMock() - dummy_text_chunker.chunk = AsyncMock(return_value=[DummyChunkHolder("chunk")]) - dummy_text_chunker.assign_page_number_to_chunks = MagicMock() - - result = await process_semantic_text_chunker(record, dummy_text_chunker) - assert result["recordId"] == "6" - assert result["data"] is None - assert "errors" in result - - @pytest.mark.asyncio async def test_process_semantic_text_chunker_empty_content(): """ @@ -244,7 +204,7 @@ def __init__(self, text): class DummyNLP: - def __call__(self, text): + def __call__(self, text, disable): return DummyDoc(text) @@ -253,7 +213,6 @@ def __call__(self, text): def chunker(): # Use relaxed thresholds so that even short sentences qualify. stc = SemanticTextChunker( - num_surrounding_sentences=1, similarity_threshold=0.8, max_chunk_tokens=1000, min_chunk_tokens=1, @@ -267,43 +226,6 @@ def chunker(): return stc -# --- Chunk Splitting Tests Using Real (Patched) Chunker --- - - -@pytest.mark.asyncio -async def test_chunk_complete_figure(chunker): - """ - Test a text containing a complete
<figure> element. - Expect that the sentence with the complete figure is detected and grouped. - """ - text = "Text before.
<figure>Figure content</figure>
. Text after." - chunks = await chunker.chunk(text) - # For our dummy segmentation, we expect two final chunks: - # one that combines "Text before" and the figure, and one for "Text after". - assert len(chunks) == 2 - # Check that the first chunk contains a complete figure. - assert "" in chunks[0].mark_up - - -@pytest.mark.asyncio -async def test_chunk_incomplete_figure(chunker): - """ - Test a text with an incomplete figure element spanning multiple sentences. - The start and end of the figure should be grouped together. - """ - text = ( - "Text before.
<figure>Start of figure. Figure continues</figure>
. Text after." - ) - chunks = await chunker.chunk(text) - # Expected grouping: one chunk combining the normal text and the grouped figure, - # and another chunk for text after. - assert len(chunks) == 2 - # Check that the grouped chunk contains both the start and the end of the figure. - assert "" in chunks[0].mark_up - - @pytest.mark.asyncio async def test_chunk_markdown_heading(chunker): """ @@ -338,7 +260,6 @@ async def test_chunk_long_sentence(): """ # Create a chunker that forces a long sentence to exceed the max token threshold. stc = SemanticTextChunker( - num_surrounding_sentences=1, similarity_threshold=0.8, max_chunk_tokens=5, # set low so even a few words exceed it min_chunk_tokens=1, @@ -353,3 +274,197 @@ async def test_chunk_long_sentence(): # And because 12 >= 5, that sentence is immediately appended as a chunk. assert len(chunks) == 1 assert "exceed" in chunks[0].mark_up + + +def test_assign_page_number_with_html_comments(): + """Test that HTML comments are properly stripped when assigning page numbers.""" + chunker = SemanticTextChunker() + + # Create a chunk with HTML comments + chunk = ChunkHolder(mark_up=" First line\nSecond line") + + # Create page tracking holders + page_tracking_holders = [ + PageNumberTrackingHolder(page_content="First line\nSecond line", page_number=3), + ] + + # Call the method being tested + result_chunks = chunker.assign_page_number_to_chunks([chunk], page_tracking_holders) + + # Verify the page number was correctly assigned despite the HTML comment + assert result_chunks[0].page_number == 3 + + +@pytest.mark.asyncio +async def test_clean_new_lines(): + """Test the clean_new_lines method properly processes newlines.""" + chunker = SemanticTextChunker() + + # Test with various newline patterns + text = "

First line\nSecond line

\n\n

Next paragraph

" + result = chunker.clean_new_lines(text) + + # Check that single newlines between tags are removed + assert "

First line Second line

" in result + # Check that multiple newlines are replaced with space + \n\n + assert "

\n\n

" in result + + +@pytest.mark.asyncio +async def test_filter_empty_figures(): + """Test the filter_empty_figures method removes empty figure tags.""" + chunker = SemanticTextChunker() + + # Test with empty and non-empty figures + text = "

Text

More text

Content
" + result = chunker.filter_empty_figures(text) + + # Check that empty figures are removed + assert "
" not in result + # Check that non-empty figures remain + assert "
<figure>Content</figure>
" in result + + +@pytest.mark.asyncio +async def test_group_figures_and_tables(): + """Test grouping of figures and tables into sentences.""" + chunker = SemanticTextChunker() + + sentences = ["Before table.", "Row 1", "Row 2
", "After table."] + + grouped, is_table_map = chunker.group_figures_and_tables_into_sentences(sentences) + + # Check that table contents are grouped + assert len(grouped) == 3 + assert "Row 1 Row 2
" in grouped + # Check table map is correct + assert is_table_map == [False, True, False] + + +@pytest.mark.asyncio +async def test_remove_figures(): + """Test the remove_figures method.""" + chunker = SemanticTextChunker() + + text = 'Text before
<figure>Figure content</figure>
text after' + result = chunker.remove_figures(text) + + assert "Text before text after" == result + assert " +Cell 1Cell 2 + + +> Blockquote text""" + + chunks = await chunker.chunk(text) + + # Verify we have reasonable chunks + assert len(chunks) >= 1 + + # Check heading formatting + heading_chunks = [c for c in chunks if "# Heading 1" in c.mark_up] + assert len(heading_chunks) > 0 + assert "# Heading" in heading_chunks[0].mark_up + + +@pytest.mark.asyncio +async def test_process_page_tracking_no_match(): + """Test behavior when page_number_tracking_holders is provided but doesn't match any chunks.""" + record = { + "recordId": "8", + "data": { + "content": "Unique content that won't match page tracking.", + "page_number_tracking_holders": [ + {"page_content": "Something completely different", "page_number": 10} + ], + }, + } + + chunker = SemanticTextChunker() + result = await process_semantic_text_chunker(record, chunker) + + # Should default to page 1 when no match is found + assert result["data"]["chunks"][0]["page_number"] == 1 + + +@pytest.mark.asyncio +async def test_nested_html_structure(): + """Test handling of nested HTML tags.""" + chunker = SemanticTextChunker() + + text = """
+

Paragraph with bold text and italic text

+ + + +
<tr><th>Header 1</th><th>Header 2</th></tr>
<tr><td>Value 1</td><td>Value 2</td></tr>
+
""" + + chunks = await chunker.chunk(text) + + # Verify we get at least one chunk + assert len(chunks) > 0 + # Check that the table is kept intact in one chunk + table_chunks = [ + c for c in chunks if "" in c.mark_up and "
" in c.mark_up + ] + assert len(table_chunks) > 0 + + +def test_sentence_similarity(): + """Test the sentence_similarity method.""" + chunker = SemanticTextChunker() + + # Should be highly similar + text1 = "Machine learning is a field of artificial intelligence." + text2 = "Artificial intelligence includes the domain of machine learning." + similarity = chunker.sentence_similarity(text1, text2) + + # The exact value will depend on the model, but should be relatively high + assert similarity > 0.5 + + # Should be less similar + text3 = "Python is a programming language." + similarity2 = chunker.sentence_similarity(text1, text3) + + # Should be lower than the first comparison + assert similarity2 < similarity + + +@pytest.mark.asyncio +async def test_special_characters_handling(): + """Test chunking text with special characters and non-English content.""" + chunker = SemanticTextChunker() + + text = """# Résumé + +Special characters: ©®™℠ + +Non-English: こんにちは 你好 안녕하세요 + +Math symbols: ∑ ∫ ∏ √ ∂ Δ π μ σ""" + + chunks = await chunker.chunk(text) + + # Verify chunks were created + assert len(chunks) > 0 + # Check content is preserved + combined_content = " ".join(c.mark_up for c in chunks) + assert "©®™℠" in combined_content + assert "こんにちは" in combined_content