Improve Text Chunker #168

Merged · 3 commits · Mar 21, 2025

@@ -283,7 +283,6 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:

     def get_semantic_chunker_skill(
         self,
-        num_surrounding_sentences: int = 2,
         similarity_threshold: float = 0.8,
         max_chunk_tokens: int = 500,
         min_chunk_tokens: int = 150,
@@ -294,7 +293,6 @@ def get_semantic_chunker_skill(
         -----
             context (str): The context of the skill
             source (str): The source of the skill
-            num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1.
             similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8.
             max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 200.

@@ -314,8 +312,8 @@ def get_semantic_chunker_skill(
                 name="content", source="/document/layout_merged_content"
             ),
             InputFieldMappingEntry(
-                name="per_page_starting_sentences",
-                source="/document/per_page_starting_sentences",
+                name="page_number_tracking_holders",
+                source="/document/page_number_tracking_holders",
             ),
         ]

@@ -333,7 +331,6 @@ def get_semantic_chunker_skill(
             degree_of_parallelism=degree_of_parallelism,
             http_method="POST",
             http_headers={
-                "num_surrounding_sentences": num_surrounding_sentences,
                 "similarity_threshold": similarity_threshold,
                 "max_chunk_tokens": max_chunk_tokens,
                 "min_chunk_tokens": min_chunk_tokens,
@@ -385,8 +382,8 @@ def get_layout_analysis_skill(
         output = [
             OutputFieldMappingEntry(name="layout", target_name="layout"),
             OutputFieldMappingEntry(
-                name="per_page_starting_sentences",
-                target_name="per_page_starting_sentences",
+                name="page_number_tracking_holders",
+                target_name="page_number_tracking_holders",
             ),
         ]

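For context, a minimal sketch of the skill definition these hunks produce: the HTTP header names and input mappings are taken from the diff above, while the skill name, URI, context, and output mapping are hypothetical placeholders rather than values from this repository.

```python
# Illustrative sketch only: header names and input mappings mirror the diff;
# the skill name, URI, context, and output mapping are hypothetical.
from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    WebApiSkill,
)

semantic_chunker_skill = WebApiSkill(
    name="semantic-chunker",  # hypothetical
    uri="https://my-function-app.azurewebsites.net/api/semantic_text_chunker",  # hypothetical
    context="/document",  # hypothetical
    http_method="POST",
    http_headers={
        "similarity_threshold": 0.8,
        "max_chunk_tokens": 500,
        "min_chunk_tokens": 150,
    },
    inputs=[
        InputFieldMappingEntry(name="content", source="/document/layout_merged_content"),
        InputFieldMappingEntry(
            name="page_number_tracking_holders",
            source="/document/page_number_tracking_holders",
        ),
    ],
    outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")],  # hypothetical
)
```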
image_processing/README.md: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ This skill merges the layout output with the figure outputs to create a unified

 ### Semantic Chunker Custom Skill

-You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
+You can then test the chunking by sending an AI Search JSON payload to the `/semantic_text_chunker/` HTTP endpoint. The headers control the chunking parameters *(similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.

 ### MarkUp Cleaner Custom Skill
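To illustrate that README instruction, a hedged sketch of a local test call follows. The endpoint path comes from the README; the host and port (a local Azure Functions host), the default `/api` route prefix, and the sample record fields are assumptions based on the skill inputs above.

```python
# Hedged sketch of a local test call; host, route prefix, and sample data are
# assumptions, while the header names mirror the chunking parameters above.
import requests

payload = {
    "values": [
        {
            "recordId": "0",
            "data": {
                "content": "First sentence of page one. More text follows...",
                "page_number_tracking_holders": [
                    {"page_number": 1, "page_content": "First sentence of page one."}
                ],
            },
        }
    ]
}

response = requests.post(
    "http://localhost:7071/api/semantic_text_chunker",  # assumed local endpoint
    json=payload,
    headers={  # header values are strings on the wire; the function coerces them
        "similarity_threshold": "0.8",
        "max_chunk_tokens": "500",
        "min_chunk_tokens": "150",
    },
)
print(response.status_code, response.json())
```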
image_processing/src/image_processing/function_app.py: 0 additions & 4 deletions
@@ -171,9 +171,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:

    semantic_text_chunker_config = req.headers

-    num_surrounding_sentences = int(
-        semantic_text_chunker_config.get("num_surrounding_sentences", 1)
-    )
     similarity_threshold = float(
         semantic_text_chunker_config.get("similarity_threshold", 0.8)
     )
@@ -192,7 +189,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
     record_tasks = []

     semantic_text_chunker_processor = SemanticTextChunker(
-        num_surrounding_sentences=num_surrounding_sentences,
         similarity_threshold=similarity_threshold,
         max_chunk_tokens=max_chunk_tokens,
         min_chunk_tokens=min_chunk_tokens,
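Header values arrive as strings in an HTTP request, which is why the handler coerces them with `float()` and `int()`. Below is a self-contained sketch of that pattern with a plain dict standing in for `req.headers`; the token defaults are borrowed from the skill signature above as an assumption, since the function-side defaults are elided in this hunk.

```python
# Sketch of the header-parsing pattern above; a plain dict stands in for
# req.headers. The token defaults are assumed from the skill signature.
semantic_text_chunker_config = {"similarity_threshold": "0.9"}  # e.g. one header set

similarity_threshold = float(semantic_text_chunker_config.get("similarity_threshold", 0.8))
max_chunk_tokens = int(semantic_text_chunker_config.get("max_chunk_tokens", 500))
min_chunk_tokens = int(semantic_text_chunker_config.get("min_chunk_tokens", 150))

print(similarity_threshold, max_chunk_tokens, min_chunk_tokens)  # 0.9 500 150
```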
image_processing/src/image_processing/layout_analysis.py: 32 additions & 15 deletions
@@ -22,8 +22,9 @@
     LayoutHolder,
     PageWiseContentHolder,
     NonPageWiseContentHolder,
-    PerPageStartingSentenceHolder,
+    PageNumberTrackingHolder,
 )
+import re


 class StorageAccountHelper:
@@ -341,14 +342,14 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

         return page_wise_contents

-    def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
+    def create_page_number_tracking_holder(self) -> list[PageNumberTrackingHolder]:
         """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.

         Returns:
         --------
             list: A list of the starting sentence of each page."""

-        per_page_starting_sentences = []
+        page_number_tracking_holders = []

         for page in self.result.pages:
             page_content = self.result.content[
@@ -358,22 +359,38 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde

             # Remove any leading whitespace/newlines.
             cleaned_content = page_content.lstrip()
-            # If a newline appears before a period, split on newline; otherwise, on period.
-            if "\n" in cleaned_content:
-                first_line = cleaned_content.split("\n", 1)[0]
-            elif "." in cleaned_content:
-                first_line = cleaned_content.split(".", 1)[0]
+            # Strip the html comment but keep the content
+            html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
+            cleaned_content = html_comments_pattern.sub("", cleaned_content)
+
+            # Remove anything inside a figure tag
+            cleaned_content = re.sub(
+                "<figure>(.*?)</figure>",
+                "",
+                cleaned_content,
+                flags=re.DOTALL | re.MULTILINE,
+            )
+            logging.info(f"Page Number: {page.page_number}")
+            logging.info(f"Content for Page Detection: {page_content}")
+            logging.info(f"Cleaned Content for Page Detection: {cleaned_content}")
+
+            if len(cleaned_content) == 0:
+                logging.error(
+                    "No content found in the cleaned result for page %s.",
+                    page.page_number,
+                )
+                cleaned_content = None
             else:
-                first_line = cleaned_content
+                cleaned_content = cleaned_content.strip()

-            per_page_starting_sentences.append(
-                PerPageStartingSentenceHolder(
+            page_number_tracking_holders.append(
+                PageNumberTrackingHolder(
                     page_number=page.page_number,
-                    starting_sentence=first_line.strip(),
+                    page_content=cleaned_content,
                 )
             )

-        return per_page_starting_sentences
+        return page_number_tracking_holders

     async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
         """Get the Azure Document Intelligence client.
@@ -522,11 +539,11 @@ async def analyse(self):
                 if self.extract_figures:
                     await self.process_figures_from_extracted_content(text_content)

-                per_page_starting_sentences = self.create_per_page_starting_sentence()
+                page_number_tracking_holders = self.create_page_number_tracking_holder()

                 output_record = NonPageWiseContentHolder(
                     layout=text_content,
-                    per_page_starting_sentences=per_page_starting_sentences,
+                    page_number_tracking_holders=page_number_tracking_holders,
                 )

         except Exception as e:
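To see the new cleaning step in isolation, here is a self-contained sketch of the two regex passes from the hunk above, run on an invented page string; the sample text is made up for illustration.

```python
# Standalone sketch of the cleaning logic above; the sample page text is invented.
import re

page_content = """<!-- PageHeader="Annual Report" -->
<figure>Figure 1: revenue by segment</figure>
Results improved across all segments in FY24."""

cleaned_content = page_content.lstrip()

# Strip the HTML comment but keep the surrounding content.
html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
cleaned_content = html_comments_pattern.sub("", cleaned_content)

# Remove anything inside a figure tag.
cleaned_content = re.sub(
    "<figure>(.*?)</figure>", "", cleaned_content, flags=re.DOTALL | re.MULTILINE
)

# Empty pages yield None; otherwise the text is stripped (as in the diff).
cleaned_content = cleaned_content.strip() if len(cleaned_content) > 0 else None
print(cleaned_content)  # -> Results improved across all segments in FY24.
```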
image_processing/src/image_processing/layout_holders.py: 3 additions & 4 deletions
@@ -47,18 +47,18 @@ class PageWiseContentHolder(BaseModel):
     page_wise_layout: list[LayoutHolder]


-class PerPageStartingSentenceHolder(BaseModel):
+class PageNumberTrackingHolder(BaseModel):
     """A class to hold the starting sentence of each page."""

     page_number: int
-    starting_sentence: str
+    page_content: str | None


 class NonPageWiseContentHolder(BaseModel):
     """A class to hold the non-page-wise content extracted from the document."""

     layout: LayoutHolder
-    per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
+    page_number_tracking_holders: list[PageNumberTrackingHolder] = Field(
         default_factory=list
     )

@@ -69,6 +69,5 @@ class ChunkHolder(BaseModel):
     mark_up: str
     sections: Optional[list[str]] = Field(default_factory=list)
     figures: Optional[list[FigureHolder]] = Field(default_factory=list)
-    starting_sentence: Optional[str] = None
     cleaned_text: Optional[str] = None
     page_number: Optional[int] = Field(default=None)
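Finally, a quick sanity sketch of the renamed model, mirroring the fields shown above; it assumes Pydantic is installed and Python 3.10+ for the `str | None` union.

```python
# Minimal self-contained sketch mirroring PageNumberTrackingHolder as defined above.
from pydantic import BaseModel

class PageNumberTrackingHolder(BaseModel):
    page_number: int
    page_content: str | None

pages = [
    PageNumberTrackingHolder(page_number=1, page_content="Results improved in FY24."),
    PageNumberTrackingHolder(page_number=2, page_content=None),  # no usable text found
]
print([p.page_number for p in pages if p.page_content is None])  # -> [2]
```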