diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
index 8a872f8..4e00d14 100644
--- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
+++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
@@ -283,7 +283,6 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
def get_semantic_chunker_skill(
self,
- num_surrounding_sentences: int = 2,
similarity_threshold: float = 0.8,
max_chunk_tokens: int = 500,
min_chunk_tokens: int = 150,
@@ -294,7 +293,6 @@ def get_semantic_chunker_skill(
-----
context (str): The context of the skill
source (str): The source of the skill
- num_surrounding_sentences (int, optional): The number of surrounding sentences. Defaults to 1.
similarity_threshold (float, optional): The similarity threshold. Defaults to 0.8.
max_chunk_tokens (int, optional): The maximum number of tokens. Defaults to 200.
@@ -314,8 +312,8 @@ def get_semantic_chunker_skill(
name="content", source="/document/layout_merged_content"
),
InputFieldMappingEntry(
- name="per_page_starting_sentences",
- source="/document/per_page_starting_sentences",
+ name="page_number_tracking_holders",
+ source="/document/page_number_tracking_holders",
),
]
@@ -333,7 +331,6 @@ def get_semantic_chunker_skill(
degree_of_parallelism=degree_of_parallelism,
http_method="POST",
http_headers={
- "num_surrounding_sentences": num_surrounding_sentences,
"similarity_threshold": similarity_threshold,
"max_chunk_tokens": max_chunk_tokens,
"min_chunk_tokens": min_chunk_tokens,
@@ -385,8 +382,8 @@ def get_layout_analysis_skill(
output = [
OutputFieldMappingEntry(name="layout", target_name="layout"),
OutputFieldMappingEntry(
- name="per_page_starting_sentences",
- target_name="per_page_starting_sentences",
+ name="page_number_tracking_holders",
+ target_name="page_number_tracking_holders",
),
]
diff --git a/image_processing/README.md b/image_processing/README.md
index be740e4..10fdc74 100644
--- a/image_processing/README.md
+++ b/image_processing/README.md
@@ -98,7 +98,7 @@ This skill merges the layout output with the figure outputs to create a unified
### Semantic Chunker Custom Skill
-You can then test the chunking by sending a AI Search JSON format to the `/semantic_text_chunker/ HTTP endpoint. The header controls the different chunking parameters *(num_surrounding_sentences, similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
+You can then test the chunking by sending an AI Search JSON request to the `/semantic_text_chunker/` HTTP endpoint. The headers control the different chunking parameters *(similarity_threshold, max_chunk_tokens, min_chunk_tokens)*.
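+
+For example, a minimal request sketch (assuming the Function App is running on a local Functions host; the URL, record values and header values below are illustrative) could look like this:
+
+```python
+import requests
+
+payload = {
+    "values": [
+        {
+            "recordId": "1",
+            "data": {"content": "# Heading\n\nSome text to chunk."},
+        }
+    ]
+}
+
+response = requests.post(
+    "http://localhost:7071/api/semantic_text_chunker",  # adjust host/route to your deployment
+    json=payload,
+    # Chunking parameters are passed as string-valued headers
+    headers={
+        "similarity_threshold": "0.8",
+        "max_chunk_tokens": "500",
+        "min_chunk_tokens": "150",
+    },
+)
+print(response.json())
+```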
### MarkUp Cleaner Custom Skill
diff --git a/image_processing/src/image_processing/function_app.py b/image_processing/src/image_processing/function_app.py
index c918eff..cefcdc2 100644
--- a/image_processing/src/image_processing/function_app.py
+++ b/image_processing/src/image_processing/function_app.py
@@ -171,9 +171,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
semantic_text_chunker_config = req.headers
- num_surrounding_sentences = int(
- semantic_text_chunker_config.get("num_surrounding_sentences", 1)
- )
similarity_threshold = float(
semantic_text_chunker_config.get("similarity_threshold", 0.8)
)
@@ -192,7 +189,6 @@ async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
record_tasks = []
semantic_text_chunker_processor = SemanticTextChunker(
- num_surrounding_sentences=num_surrounding_sentences,
similarity_threshold=similarity_threshold,
max_chunk_tokens=max_chunk_tokens,
min_chunk_tokens=min_chunk_tokens,
diff --git a/image_processing/src/image_processing/layout_analysis.py b/image_processing/src/image_processing/layout_analysis.py
index 5a1ef4f..64fc8b0 100644
--- a/image_processing/src/image_processing/layout_analysis.py
+++ b/image_processing/src/image_processing/layout_analysis.py
@@ -22,8 +22,9 @@
LayoutHolder,
PageWiseContentHolder,
NonPageWiseContentHolder,
- PerPageStartingSentenceHolder,
+ PageNumberTrackingHolder,
)
+import re
class StorageAccountHelper:
@@ -341,14 +342,14 @@ def create_page_wise_content(self) -> list[LayoutHolder]:
return page_wise_contents
- def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
+ def create_page_number_tracking_holder(self) -> list[PageNumberTrackingHolder]:
"""Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
Returns:
--------
list: A list of the starting sentence of each page."""
- per_page_starting_sentences = []
+ page_number_tracking_holders = []
for page in self.result.pages:
page_content = self.result.content[
@@ -358,22 +359,38 @@ def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolde
# Remove any leading whitespace/newlines.
cleaned_content = page_content.lstrip()
- # If a newline appears before a period, split on newline; otherwise, on period.
- if "\n" in cleaned_content:
- first_line = cleaned_content.split("\n", 1)[0]
- elif "." in cleaned_content:
- first_line = cleaned_content.split(".", 1)[0]
+ # Strip out any HTML comments but keep the surrounding content
+ html_comments_pattern = re.compile(r"<!--(.*?)-->", re.DOTALL)
+ cleaned_content = html_comments_pattern.sub("", cleaned_content)
+
+ # Remove anything inside a figure tag
+ cleaned_content = re.sub(
+ r"\s*<figure>(.*?)</figure>",
+ "",
+ cleaned_content,
+ flags=re.DOTALL | re.MULTILINE,
+ )
+ logging.info(f"Page Number: {page.page_number}")
+ logging.info(f"Content for Page Detection: {page_content}")
+ logging.info(f"Cleaned Content for Page Detection: {cleaned_content}")
+
+ if len(cleaned_content) == 0:
+ logging.error(
+ "No content found in the cleaned result for page %s.",
+ page.page_number,
+ )
+ cleaned_content = None
else:
- first_line = cleaned_content
+ cleaned_content = cleaned_content.strip()
- per_page_starting_sentences.append(
- PerPageStartingSentenceHolder(
+ page_number_tracking_holders.append(
+ PageNumberTrackingHolder(
page_number=page.page_number,
- starting_sentence=first_line.strip(),
+ page_content=cleaned_content,
)
)
- return per_page_starting_sentences
+ return page_number_tracking_holders
async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
"""Get the Azure Document Intelligence client.
@@ -522,11 +539,11 @@ async def analyse(self):
if self.extract_figures:
await self.process_figures_from_extracted_content(text_content)
- per_page_starting_sentences = self.create_per_page_starting_sentence()
+ page_number_tracking_holders = self.create_page_number_tracking_holder()
output_record = NonPageWiseContentHolder(
layout=text_content,
- per_page_starting_sentences=per_page_starting_sentences,
+ page_number_tracking_holders=page_number_tracking_holders,
)
except Exception as e:
diff --git a/image_processing/src/image_processing/layout_holders.py b/image_processing/src/image_processing/layout_holders.py
index 8d1535f..9e03cff 100644
--- a/image_processing/src/image_processing/layout_holders.py
+++ b/image_processing/src/image_processing/layout_holders.py
@@ -47,18 +47,18 @@ class PageWiseContentHolder(BaseModel):
page_wise_layout: list[LayoutHolder]
-class PerPageStartingSentenceHolder(BaseModel):
+class PageNumberTrackingHolder(BaseModel):
"""A class to hold the starting sentence of each page."""
page_number: int
- starting_sentence: str
+ page_content: str | None
class NonPageWiseContentHolder(BaseModel):
"""A class to hold the non-page-wise content extracted from the document."""
layout: LayoutHolder
- per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
+ page_number_tracking_holders: list[PageNumberTrackingHolder] = Field(
default_factory=list
)
@@ -69,6 +69,5 @@ class ChunkHolder(BaseModel):
mark_up: str
sections: Optional[list[str]] = Field(default_factory=list)
figures: Optional[list[FigureHolder]] = Field(default_factory=list)
- starting_sentence: Optional[str] = None
cleaned_text: Optional[str] = None
page_number: Optional[int] = Field(default=None)
diff --git a/image_processing/src/image_processing/semantic_text_chunker.py b/image_processing/src/image_processing/semantic_text_chunker.py
index b97c667..cb340c9 100644
--- a/image_processing/src/image_processing/semantic_text_chunker.py
+++ b/image_processing/src/image_processing/semantic_text_chunker.py
@@ -7,18 +7,16 @@
import spacy
import numpy as np
from model2vec import StaticModel
-from layout_holders import PerPageStartingSentenceHolder, ChunkHolder
+from layout_holders import PageNumberTrackingHolder, ChunkHolder
class SemanticTextChunker:
def __init__(
self,
- num_surrounding_sentences: int = 2,
similarity_threshold: float = 0.8,
max_chunk_tokens: int = 500,
- min_chunk_tokens: int = 150,
+ min_chunk_tokens: int = 200,
):
- self.num_surrounding_sentences = num_surrounding_sentences
self.similarity_threshold = similarity_threshold
self.max_chunk_tokens = max_chunk_tokens
self.min_chunk_tokens = min_chunk_tokens
@@ -88,7 +86,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
Returns:
list(str): The list of chunks"""
- logging.info(f"Chunking text: {text}")
+ logging.debug(f"Chunking text: {text}")
sentences = self.split_into_sentences(text)
@@ -111,7 +109,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
f"""Number of Forward pass chunks: {
len(forward_pass_chunks)}"""
)
- logging.info(f"Forward pass chunks: {forward_pass_chunks}")
+ logging.debug(f"Forward pass chunks: {forward_pass_chunks}")
backwards_pass_chunks, _ = self.merge_chunks(
forward_pass_chunks, new_is_table_or_figure_map, forwards_direction=False
@@ -123,7 +121,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
f"""Number of Backaward pass chunks: {
len(reversed_backwards_pass_chunks)}"""
)
- logging.info(f"Backward pass chunks: {reversed_backwards_pass_chunks}")
+ logging.debug(f"Backward pass chunks: {reversed_backwards_pass_chunks}")
cleaned_final_chunks = []
for chunk in reversed_backwards_pass_chunks:
@@ -132,7 +130,7 @@ async def chunk(self, text: str) -> list[ChunkHolder]:
cleaned_final_chunks.append(ChunkHolder(mark_up=stripped_chunk))
logging.info(f"Number of final chunks: {len(cleaned_final_chunks)}")
- logging.info(f"Chunks: {cleaned_final_chunks}")
+ logging.debug(f"Chunks: {cleaned_final_chunks}")
if len(cleaned_final_chunks) == 0:
raise ValueError("No chunks were generated")
@@ -174,9 +172,12 @@ def split_into_sentences(self, text: str) -> list[str]:
# Filter out empty ... tags
cleaned_text = self.filter_empty_figures(cleaned_text)
- logging.info(f"Cleaned text: {cleaned_text}")
+ logging.debug(f"Cleaned text: {cleaned_text}")
- doc = self._nlp_model(cleaned_text)
+ self._nlp_model.max_length = len(cleaned_text) + 100
+ doc = self._nlp_model(
+ cleaned_text, disable=["ner", "tagger", "lemmatizer", "textcat"]
+ )
tag_split_sentences = []
# Pattern to match the closing and opening tag junctions with whitespace in between
@@ -258,103 +259,98 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]):
return grouped_sentences, is_table_or_figure_map
- def look_ahead_and_behind_sentences(
- self, total_sentences, is_table_or_figure_map, current_sentence_index
- ):
- is_table_or_figure_ahead = False
- is_table_or_figure_behind = False
-
- distance_to_next_figure = self.num_surrounding_sentences
-
- if current_sentence_index < self.num_surrounding_sentences:
- is_table_or_figure_behind = is_table_or_figure_map[0]
- else:
- is_table_or_figure_behind = is_table_or_figure_map[
- current_sentence_index - self.num_surrounding_sentences
- ]
-
- surround_sentences_gap_to_test = self.num_surrounding_sentences
- if current_sentence_index + self.num_surrounding_sentences >= total_sentences:
- is_table_or_figure_ahead = is_table_or_figure_map[-1]
- surround_sentences_gap_to_test = total_sentences - current_sentence_index
- else:
- is_table_or_figure_ahead = is_table_or_figure_map[
- current_sentence_index + self.num_surrounding_sentences
- ]
-
- for (
- next_sentence_is_table_or_figure_index,
- next_sentence_is_table_or_figure,
- ) in enumerate(
- is_table_or_figure_map[
- current_sentence_index : current_sentence_index
- + surround_sentences_gap_to_test
- ]
- ):
- if next_sentence_is_table_or_figure:
- distance_to_next_figure = next_sentence_is_table_or_figure_index
-
- return (
- is_table_or_figure_ahead,
- is_table_or_figure_behind,
- min(surround_sentences_gap_to_test, distance_to_next_figure),
+ def remove_figures(self, text):
+ figure_tag_pattern = r"\s*<figure[^>]*>(.*?)</figure>"
+ return re.sub(figure_tag_pattern, "", text, flags=re.DOTALL).strip()
def merge_similar_chunks(self, current_sentence, current_chunk, forwards_direction):
new_chunk = None
- def retrieve_current_chunk_up_to_n(n):
+ def retrieve_current_chunk_up_to_minus_n(n):
if forwards_direction:
return " ".join(current_chunk[:-n])
else:
return " ".join(reversed(current_chunk[:-n]))
- def retrieve_current_chunks_from_n(n):
- if forwards_direction:
- return " ".join(current_chunk[n:])
- else:
- return " ".join(reversed(current_chunk[:-n]))
-
def retrive_current_chunk_at_n(n):
if forwards_direction:
return current_chunk[n]
else:
return current_chunk[n]
+ def retrieve_current_chunks_from_n(n):
+ if forwards_direction:
+ return " ".join(current_chunk[n:])
+ else:
+ return " ".join(reversed(current_chunk[n:]))
+
def get_current_chunk_tokens(chunk_segments):
+ if isinstance(chunk_segments, str):
+ return self.num_tokens_from_string(chunk_segments)
+
return self.num_tokens_from_string(" ".join(chunk_segments))
+ if len(current_chunk) == 1:
+ logging.debug("Chunk too small to compare")
+ return new_chunk, current_chunk
+
+ if len(current_chunk) > 2:
+ would_be_end_of_old_chunk = retrieve_current_chunk_up_to_minus_n(1)
+ would_be_start_of_new_chunk = [retrive_current_chunk_at_n(-1)]
+ else:
+ would_be_end_of_old_chunk = retrive_current_chunk_at_n(0)
+ would_be_start_of_new_chunk = [retrive_current_chunk_at_n(1)]
+
current_chunk_tokens = get_current_chunk_tokens(current_chunk)
+ logging.debug(f"Current chunk tokens: {current_chunk_tokens}")
+ would_be_end_of_old_chunk_tokens = get_current_chunk_tokens(
+ would_be_end_of_old_chunk
+ )
+ logging.debug(f"Would be new chunk tokens: {would_be_end_of_old_chunk_tokens}")
- if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
- # Calculate the tokens if we were to split
- if len(current_chunk) > 2:
- would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
- would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
- else:
- would_be_new_chunk = retrive_current_chunk_at_n(0)
- would_be_current_chunk = [retrive_current_chunk_at_n(1)]
+ would_be_end_of_old_chunk_without_figures = self.remove_figures(
+ would_be_end_of_old_chunk
+ )
- if (
- get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
- and get_current_chunk_tokens(would_be_current_chunk)
- >= self.min_chunk_tokens
- ):
- logging.info("Comparing chunks")
- if (
- current_chunk_tokens >= self.max_chunk_tokens
- or self.sentence_similarity(
- retrieve_current_chunks_from_n(-2), current_sentence
- )
- < self.similarity_threshold
- ):
- return would_be_new_chunk, would_be_current_chunk
- else:
- logging.info("Chunk too small to compare")
- else:
- logging.info("Chunk too small to compare")
+ would_be_end_of_old_chunk_without_figures_tokens = self.num_tokens_from_string(
+ would_be_end_of_old_chunk_without_figures
+ )
+
+ would_be_start_of_new_chunk_without_figures = self.remove_figures(
+ " ".join(would_be_start_of_new_chunk)
+ )
+
+ if len(would_be_start_of_new_chunk_without_figures) == 0:
+ logging.debug("Chunk would only contain figures. Not comparing")
+ return new_chunk, current_chunk
+
+ if (
+ would_be_end_of_old_chunk_tokens < self.min_chunk_tokens
+ or would_be_end_of_old_chunk_without_figures_tokens
+ < (self.min_chunk_tokens / 2)
+ ):
+ logging.debug("Chunk too small. Not comparing")
+ return new_chunk, current_chunk
+
+ if would_be_end_of_old_chunk_without_figures_tokens > self.max_chunk_tokens:
+ logging.debug("Chunk too large. Not comparing")
+ return would_be_end_of_old_chunk, would_be_start_of_new_chunk
+
+ similarity_set = retrieve_current_chunks_from_n(-2)
- return new_chunk, current_chunk
+ # Calculate the tokens if we were to split
+ logging.debug("Comparing chunks")
+ if (
+ current_chunk_tokens > (self.max_chunk_tokens * 1.5)
+ or self.sentence_similarity(similarity_set, current_sentence)
+ < self.similarity_threshold
+ ):
+ return would_be_end_of_old_chunk, would_be_start_of_new_chunk
+ else:
+ logging.debug("Above similarity threshold")
+ return new_chunk, current_chunk
def is_markdown_heading(self, text):
return text.strip().startswith("#")
@@ -385,96 +381,73 @@ def retrieve_current_chunk():
index += 1
continue
- # Detect if table or figure
- if is_table_or_figure_map[current_sentence_index]:
- if forwards_direction:
+ if forwards_direction and self.is_markdown_heading(current_sentence):
+ heading_level = current_sentence.count("#")
+
+ if heading_level in [1, 2]:
+ # Start new chunk
if len(current_chunk) > 0:
- current_chunk.append(current_sentence)
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(True)
- current_chunk = []
- else:
- current_chunk.append(current_sentence)
- else:
- # On the backwards pass we don't want to add to the table chunk
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(True)
- current_chunk = [current_sentence]
+ current_chunk = retrieve_current_chunk()
+ chunks.append(current_chunk)
+ new_is_table_or_figure_map.append(
+ self.sentence_contains_figure_or_table(current_chunk)
+ )
+ current_chunk = [current_sentence]
- index += 1
- continue
- elif forwards_direction:
- # Look ahead to see if figure of table is coming up
- # We only do this on the forward pass
- (
- is_table_or_figure_ahead,
- is_table_or_figure_behind,
- min_of_distance_to_next_figure_or_num_surrounding_sentences,
- ) = self.look_ahead_and_behind_sentences(
- total_sentences, is_table_or_figure_map, current_sentence_index
- )
+ index += 1
+ continue
- if is_table_or_figure_behind:
- # Check if Makrdown heading
- if self.is_markdown_heading(current_sentence):
- # Start new chunk
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(False)
- current_chunk = [current_sentence]
- else:
- # Finish off
- current_chunk.append(current_sentence)
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(False)
- current_chunk = []
+ # Detect if table or figure
+ if forwards_direction and is_table_or_figure_map[current_sentence_index]:
+ if len(current_chunk) > 0:
+ current_chunk.append(current_sentence)
+ chunks.append(retrieve_current_chunk())
+ new_is_table_or_figure_map.append(True)
+ current_chunk = []
index += 1
continue
- elif is_table_or_figure_ahead:
- # Add to the ahead chunk
- chunks.append(retrieve_current_chunk())
- new_is_table_or_figure_map.append(False)
- if forwards_direction:
- current_chunk = sentences[
- current_sentence_index : current_sentence_index
- + min_of_distance_to_next_figure_or_num_surrounding_sentences
- ]
- else:
- current_chunk = sentences[
- current_sentence_index : current_sentence_index
- - min_of_distance_to_next_figure_or_num_surrounding_sentences : -1
- ]
- index += min_of_distance_to_next_figure_or_num_surrounding_sentences
- continue
# now group semanticly
- num_tokens = self.num_tokens_from_string(current_sentence)
+ current_chunk.append(current_sentence)
- if num_tokens >= self.max_chunk_tokens:
- chunks.append(current_sentence)
- new_is_table_or_figure_map.append(False)
- else:
- current_chunk.append(current_sentence)
+ new_chunk, current_chunk = self.merge_similar_chunks(
+ current_sentence,
+ current_chunk,
+ forwards_direction=forwards_direction,
+ )
- new_chunk, current_chunk = self.merge_similar_chunks(
- current_sentence,
- current_chunk,
- forwards_direction=forwards_direction,
+ if new_chunk is not None:
+ chunks.append(new_chunk)
+ new_is_table_or_figure_map.append(
+ self.sentence_contains_figure_or_table(new_chunk)
)
- if new_chunk is not None:
- chunks.append(new_chunk)
- new_is_table_or_figure_map.append(False)
-
index += 1
if len(current_chunk) > 0:
final_chunk = retrieve_current_chunk()
- chunks.append(final_chunk)
- new_is_table_or_figure_map.append(
- self.sentence_contains_figure_or_table(final_chunk)
- )
+ # Get tokens of this chunk
+ if (
+ self.num_tokens_from_string(final_chunk) < self.min_chunk_tokens
+ and len(chunks) > 0
+ ):
+ # Add the last chunk to the new chunks
+ if forwards_direction:
+ final_chunk = chunks[-1] + " " + final_chunk
+ else:
+ final_chunk = final_chunk + " " + chunks[-1]
+
+ chunks[-1] = final_chunk
+ new_is_table_or_figure_map[-1] = self.sentence_contains_figure_or_table(
+ final_chunk
+ )
+ else:
+ chunks.append(final_chunk)
+ new_is_table_or_figure_map.append(
+ self.sentence_contains_figure_or_table(final_chunk)
+ )
return chunks, new_is_table_or_figure_map
@@ -486,7 +459,7 @@ def sentence_similarity(self, text_1, text_2):
magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2)
similarity = dot_product / magnitude if magnitude != 0 else 0.0
- logging.info(
+ logging.debug(
f"""Similarity between '{text_1}' and '{
text_2}': {similarity}"""
)
@@ -495,28 +468,83 @@ def sentence_similarity(self, text_1, text_2):
def assign_page_number_to_chunks(
self,
chunks: list[ChunkHolder],
- per_page_starting_sentences: list[PerPageStartingSentenceHolder],
+ page_number_tracking_holders: list[PageNumberTrackingHolder],
) -> list[ChunkHolder]:
"""Assigns page numbers to the chunks based on the starting sentences of each page.
Args:
chunks (list[ChunkHolder]): The list of chunks.
- per_page_starting_sentences (list[PerPageStartingSentenceHolder]): The list of starting sentences of each page.
+ page_number_tracking_holders (list[PageNumberTrackingHolder]): The list of page content holders for each page.
Returns:
list[ChunkHolder]: The list of chunks with page numbers assigned."""
page_number = 1
for chunk in chunks:
- for per_page_starting_sentence in per_page_starting_sentences[
- page_number - 1 :
- ]:
- if per_page_starting_sentence.starting_sentence in chunk:
- logging.info(
- "Assigning page number %i to chunk",
- per_page_starting_sentence.page_number,
+ # Remove any leading whitespace/newlines.
+ cleaned_content = chunk.mark_up.lstrip()
+ # Strip out any HTML comments but keep the surrounding content
+ html_comments_pattern = re.compile(r"<!--(.*?)-->", re.DOTALL)
+ cleaned_content = html_comments_pattern.sub("", cleaned_content)
+
+ # Use the nlp model to get the first sentence
+ sentences = list(
+ self._nlp_model(
+ cleaned_content, disable=["ner", "tagger", "lemmatizer", "textcat"]
+ ).sents
+ )
+
+ if len(sentences) == 0:
+ first_line = None
+ else:
+ first_sentence = sentences[0].text.strip()
+
+ if "#" in first_sentence:
+ logging.info("Splitting on hash")
+ # Deliberately split on the next hash to get the first line of the markdown content
+ first_line = (
+ first_sentence.split(" #", 1)[0]
+ .strip()
+ .split("\n", 1)[0]
+ .strip()
)
- page_number = per_page_starting_sentence.page_number
- break
+ elif "
" in first_sentence:
+ logging.info("Joining onto second sentence to form first row")
+ if len(sentences) > 1:
+ first_line = (
+ first_sentence.lstrip() + "\n" + sentences[1].text.strip()
+ )
+ else:
+ first_line = first_sentence
+ elif "\n" in first_sentence:
+ logging.info("Splitting on newline")
+ first_line = first_sentence.split("\n", 1)[0].strip()
+ elif "." in first_sentence:
+ logging.info("Splitting on period")
+ first_line = first_sentence.split(".", 1)[0].strip()
+ else:
+ logging.info("No split found")
+ first_line = first_sentence.strip()
+
+ if first_line is not None:
+ logging.info(f"Looking for First line: {first_line}")
+ for page_number_tracking_holder in page_number_tracking_holders[
+ page_number - 1 :
+ ]:
+ if page_number_tracking_holder.page_content is not None:
+ if (
+ first_line == page_number_tracking_holder.page_content
+ or first_line in page_number_tracking_holder.page_content
+ or first_line
+ in page_number_tracking_holder.page_content.replace(
+ "\n", " "
+ )
+ ):
+ logging.info(
+ "Assigning page number %i to chunk",
+ page_number,
+ )
+ page_number = page_number_tracking_holder.page_number
+ break
chunk.page_number = page_number
return chunks
@@ -545,16 +573,16 @@ async def process_semantic_text_chunker(record: dict, text_chunker) -> dict:
# scenarios when page by chunking is enabled
chunks = await text_chunker.chunk(record["data"]["content"])
- if "per_page_starting_sentences" in record["data"]:
- per_page_starting_sentences = [
- PerPageStartingSentenceHolder(**sentence)
- for sentence in record["data"]["per_page_starting_sentences"]
+ if "page_number_tracking_holders" in record["data"]:
+ page_number_tracking_holders = [
+ PageNumberTrackingHolder(**sentence)
+ for sentence in record["data"]["page_number_tracking_holders"]
]
- logging.info(f"Per page starting sentences: {per_page_starting_sentences}")
+ logging.info(f"Per page holders: {page_number_tracking_holders}")
chunks = text_chunker.assign_page_number_to_chunks(
- chunks, per_page_starting_sentences
+ chunks, page_number_tracking_holders
)
cleaned_record["data"]["chunks"] = [
diff --git a/image_processing/tests/image_processing/test_figure_app.py b/image_processing/tests/image_processing/test_figure_app.py
index e86a1f4..dcdd1b9 100644
--- a/image_processing/tests/image_processing/test_figure_app.py
+++ b/image_processing/tests/image_processing/test_figure_app.py
@@ -188,7 +188,6 @@ async def dummy_process_semantic_text_chunker(value, processor):
)
headers = {
- "num_surrounding_sentences": "2",
"similarity_threshold": "0.9",
"max_chunk_tokens": "600",
"min_chunk_tokens": "60",
diff --git a/image_processing/tests/image_processing/test_layout_analysis.py b/image_processing/tests/image_processing/test_layout_analysis.py
index e9de95a..5c4d642 100644
--- a/image_processing/tests/image_processing/test_layout_analysis.py
+++ b/image_processing/tests/image_processing/test_layout_analysis.py
@@ -11,6 +11,8 @@
LayoutAnalysis,
)
+from layout_holders import LayoutHolder
+
# --- Dummy classes to simulate ADI results and figures ---
class DummySpan:
@@ -436,7 +438,7 @@ class DummyResultContent:
assert layout.page_offsets == 0
-def test_create_per_page_starting_sentence():
+def test_create_page_number_tracking_holder():
# Create a LayoutAnalysis instance.
la = LayoutAnalysis(record_id=200, source="dummy")
@@ -449,17 +451,17 @@ class DummyResultContent:
dummy_result = DummyResultContent()
dummy_result.content = "HelloWorld. This is a test sentence."
# DummyPage creates a page with spans as a list of dictionaries.
- dummy_result.pages = [DummyPage(0, 10, 1)]
+ dummy_result.pages = [DummyPage(0, 36, 1)]
la.result = dummy_result
- sentences = la.create_per_page_starting_sentence()
- assert len(sentences) == 1
- sentence = sentences[0]
- assert sentence.page_number == 1
- assert sentence.starting_sentence == "HelloWorld"
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ tracker = page_number_trackers[0]
+ assert tracker.page_number == 1
+ assert tracker.page_content == "HelloWorld. This is a test sentence."
-def test_create_per_page_starting_sentence_multiple_pages():
+def test_create_page_number_tracking_holder_multiple_pages():
# Create a LayoutAnalysis instance.
la = LayoutAnalysis(record_id=300, source="dummy")
@@ -479,15 +481,337 @@ class DummyResultContent:
]
la.result = dummy_result
- # Call create_per_page_starting_sentence and check results.
- sentences = la.create_per_page_starting_sentence()
- assert len(sentences) == 2
+ # Call create_page_number_tracking_holder and check results.
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 2
# For page 1, the substring is "Page one." -> split on "." gives "Page one"
- assert sentences[0].page_number == 1
- assert sentences[0].starting_sentence == "Page one"
+ assert page_number_trackers[0].page_number == 1
+ assert page_number_trackers[0].page_content == "Page one."
# For page 2, the substring is "Page two text and" -> split on "." gives the entire string
- assert sentences[1].page_number == 2
+ assert page_number_trackers[1].page_number == 2
# We strip potential leading/trailing spaces for validation.
- assert sentences[1].starting_sentence.strip() == "Page two text and more content"
+ assert (
+ page_number_trackers[1].page_content.strip()
+ == "Page two text and more content. This is more random content that is on page 2."
+ )
+
+
+# Test for download_figure_image with retry logic
+@pytest.mark.asyncio
+async def test_download_figure_image_with_retry(monkeypatch):
+ """Test the download_figure_image method with retry logic."""
+ la = LayoutAnalysis(record_id=101, source="dummy")
+ la.operation_id = "op101"
+ la.result = DummyResult("content", [], [], model_id="model101")
+
+ # Create a counter to track number of attempts
+ call_count = 0
+
+ # Mock document_intelligence_client.get_analyze_result_figure
+ class MockResponse:
+ def __init__(self):
+ self.chunks = [b"chunk1", b"chunk2"]
+
+ def __aiter__(self):
+ return self
+
+ async def __anext__(self):
+ if not self.chunks:
+ raise StopAsyncIteration
+ return self.chunks.pop(0)
+
+ class MockClient:
+ async def __aenter__(self):
+ return self
+
+ async def __aexit__(self, *args):
+ pass
+
+ async def get_analyze_result_figure(self, model_id, result_id, figure_id):
+ nonlocal call_count
+ call_count += 1
+ if call_count == 1:
+ # Fail on first attempt
+ raise Exception("Temporary failure")
+ # Succeed on subsequent attempts
+ return MockResponse()
+
+ # Patch get_document_intelligence_client to return our mock
+ async def mock_get_client():
+ return MockClient()
+
+ monkeypatch.setattr(la, "get_document_intelligence_client", mock_get_client)
+
+ # Call the method - should succeed after retry
+ result = await la.download_figure_image("fig1")
+
+ # Check that it was called more than once (at least one retry)
+ assert call_count > 1
+ # Check the result contains both chunks
+ assert result == b"chunk1chunk2"
+
+
+# Test for non-page-wise analysis with figures
+@pytest.mark.asyncio
+async def test_analyse_non_page_wise_with_figures(monkeypatch, dummy_storage_helper):
+ """Test non-page-wise analysis with figures."""
+ source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt"
+ la = LayoutAnalysis(
+ page_wise=False, extract_figures=True, record_id=102, source=source
+ )
+ la.extract_file_info()
+
+ monkeypatch.setattr(
+ la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper)
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "download_blob_to_temp_dir",
+ AsyncMock(return_value=("/tmp/dummy.txt", {})),
+ )
+
+ # Create a dummy result with content and a figure
+ dummy_page = DummyPage(0, 20, 1)
+ dummy_figure = DummyFigure(
+ "fig102",
+ offset=10,
+ length=5,
+ page_number=1,
+ caption_content="Figure 102 caption",
+ )
+ dummy_result = DummyResult(
+ content="Full document content", pages=[dummy_page], figures=[dummy_figure]
+ )
+
+ async def dummy_analyse_document(file_path):
+ la.result = dummy_result
+ la.operation_id = "op102"
+
+ monkeypatch.setattr(la, "analyse_document", dummy_analyse_document)
+
+ # Mock figure download and upload
+ monkeypatch.setattr(
+ la, "download_figure_image", AsyncMock(return_value=b"figure102_image_data")
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "upload_blob",
+ AsyncMock(return_value="http://dummy.url/fig102.png"),
+ )
+
+ result = await la.analyse()
+
+ assert result["recordId"] == 102
+ assert result["data"] is not None
+ # In non-page-wise mode, we should have layout and page_number_tracking_holders
+ assert "layout" in result["data"]
+ assert "page_number_tracking_holders" in result["data"]
+
+ # Verify figure was processed
+ layout = result["data"]["layout"]
+ assert "figures" in layout
+ figures = layout["figures"]
+ assert len(figures) == 1
+ assert figures[0]["figure_id"] == "fig102"
+ assert figures[0]["caption"] == "Figure 102 caption"
+ expected_b64 = base64.b64encode(b"figure102_image_data").decode("utf-8")
+ assert figures[0]["data"] == expected_b64
+
+
+# Test for when extract_figures is False
+@pytest.mark.asyncio
+async def test_analyse_without_extracting_figures(monkeypatch, dummy_storage_helper):
+ """Test analysis when extract_figures is False."""
+ source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt"
+ la = LayoutAnalysis(
+ page_wise=True, extract_figures=False, record_id=103, source=source
+ )
+ la.extract_file_info()
+
+ monkeypatch.setattr(
+ la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper)
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "download_blob_to_temp_dir",
+ AsyncMock(return_value=("/tmp/dummy.txt", {})),
+ )
+
+ # Create a dummy result with content and a figure
+ dummy_page = DummyPage(0, 10, 1)
+ dummy_figure = DummyFigure(
+ "fig103",
+ offset=5,
+ length=3,
+ page_number=1,
+ caption_content="Figure 103 caption",
+ )
+ dummy_result = DummyResult(
+ content="Page content", pages=[dummy_page], figures=[dummy_figure]
+ )
+
+ async def dummy_analyse_document(file_path):
+ la.result = dummy_result
+ la.operation_id = "op103"
+
+ monkeypatch.setattr(la, "analyse_document", dummy_analyse_document)
+
+ # Add spy on process_figures_from_extracted_content to ensure it's not called
+ process_figures_spy = AsyncMock()
+ monkeypatch.setattr(
+ la, "process_figures_from_extracted_content", process_figures_spy
+ )
+
+ result = await la.analyse()
+
+ # Verify the function was not called
+ process_figures_spy.assert_not_called()
+
+ assert result["recordId"] == 103
+ assert result["data"] is not None
+ # Verify we have page_wise_layout
+ assert "page_wise_layout" in result["data"]
+ layouts = result["data"]["page_wise_layout"]
+ assert len(layouts) == 1
+ # Each layout should have an empty figures list
+ assert layouts[0]["figures"] == []
+
+
+# Test for HTML comment handling in create_page_number_tracking_holder
+def test_create_page_number_tracking_holder_html_comments():
+ """Test HTML comment handling in page content extraction."""
+ la = LayoutAnalysis(record_id=104, source="dummy")
+
+ class DummyResultContent:
+ pass
+
+ dummy_result = DummyResultContent()
+ # Content with an HTML comment embedded in the text
+ dummy_result.content = "Before <!-- Comment -->After"
+ dummy_result.pages = [DummyPage(0, 29, 1)] # Full content
+ la.result = dummy_result
+
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ # HTML comments should be removed
+ assert page_number_trackers[0].page_content == "Before After"
+
+
+# Test for figure tag handling in create_page_number_tracking_holder
+def test_create_page_number_tracking_holder_figure_tags():
+ """Test figure tag handling in page content extraction."""
+ la = LayoutAnalysis(record_id=105, source="dummy")
+
+ class DummyResultContent:
+ pass
+
+ dummy_result = DummyResultContent()
+ # Content with figure tags
+ dummy_result.content = "Before Figure content After"
+ dummy_result.pages = [DummyPage(0, 44, 1)] # Full content
+ la.result = dummy_result
+
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ # Figure content should be removed
+ assert page_number_trackers[0].page_content == "Before After"
+
+
+# Test handling of empty content
+def test_create_page_number_tracking_holder_empty_content():
+ """Test handling of empty content in page tracking."""
+ la = LayoutAnalysis(record_id=106, source="dummy")
+
+ class DummyResultContent:
+ pass
+
+ dummy_result = DummyResultContent()
+ # Empty content
+ dummy_result.content = ""
+ dummy_result.pages = [DummyPage(0, 0, 1)] # Empty content
+ la.result = dummy_result
+
+ page_number_trackers = la.create_page_number_tracking_holder()
+ assert len(page_number_trackers) == 1
+ # Page content should be None for empty content
+ assert page_number_trackers[0].page_content is None
+
+
+# Test for process_layout_analysis with page_wise=True
+@pytest.mark.asyncio
+async def test_process_layout_analysis_page_wise(monkeypatch):
+ """Test process_layout_analysis with page_wise=True."""
+ record = {
+ "recordId": "107",
+ "data": {"source": "https://dummy.blob.core.windows.net/container/blob.pdf"},
+ }
+
+ # Create a mock LayoutAnalysis
+ mock_layout_analysis = AsyncMock()
+ mock_layout_analysis.analyse = AsyncMock(
+ return_value={"recordId": "107", "data": {"result": "success"}}
+ )
+
+ # Mock the LayoutAnalysis constructor
+ def mock_layout_analysis_constructor(*args, **kwargs):
+ # Verify page_wise=True was passed
+ assert kwargs["page_wise"] is True
+ return mock_layout_analysis
+
+ monkeypatch.setattr(
+ "layout_analysis.LayoutAnalysis", mock_layout_analysis_constructor
+ )
+
+ result = await process_layout_analysis(record, page_wise=True)
+
+ # Verify analyse was called
+ mock_layout_analysis.analyse.assert_called_once()
+ assert result["recordId"] == "107"
+ assert result["data"] == {"result": "success"}
+
+
+# Test handling figures without captions
+@pytest.mark.asyncio
+async def test_figure_without_caption(monkeypatch, dummy_storage_helper):
+ """Test handling figures without captions."""
+ source = "https://dummyaccount.blob.core.windows.net/container/path/to/file.txt"
+ la = LayoutAnalysis(
+ page_wise=False, extract_figures=True, record_id=108, source=source
+ )
+ la.extract_file_info()
+
+ monkeypatch.setattr(
+ la, "get_storage_account_helper", AsyncMock(return_value=dummy_storage_helper)
+ )
+ monkeypatch.setattr(
+ dummy_storage_helper,
+ "download_blob_to_temp_dir",
+ AsyncMock(return_value=("/tmp/dummy.txt", {})),
+ )
+
+ # Create a figure without a caption (caption=None)
+ dummy_figure = DummyFigure(
+ "fig108", offset=5, length=3, page_number=1, caption_content=None
+ )
+ dummy_result = DummyResult(
+ content="Content", pages=[DummyPage(0, 7, 1)], figures=[dummy_figure]
+ )
+
+ la.result = dummy_result
+ monkeypatch.setattr(
+ la, "download_figure_image", AsyncMock(return_value=b"figure108_image_data")
+ )
+
+ # Create a minimal layout holder for testing
+ layout_holder = LayoutHolder(content="Test", page_number=1, page_offsets=0)
+
+ # Process the figures
+ await la.process_figures_from_extracted_content(layout_holder)
+
+ # Check that the figure was processed despite having no caption
+ assert len(layout_holder.figures) == 1
+ figure = layout_holder.figures[0]
+ assert figure.figure_id == "fig108"
+ assert figure.caption is None # Caption should be None
diff --git a/image_processing/tests/image_processing/test_layout_holders.py b/image_processing/tests/image_processing/test_layout_holders.py
index 3d2d1c4..4e23893 100644
--- a/image_processing/tests/image_processing/test_layout_holders.py
+++ b/image_processing/tests/image_processing/test_layout_holders.py
@@ -8,7 +8,7 @@
PageWiseContentHolder,
NonPageWiseContentHolder,
ChunkHolder,
- PerPageStartingSentenceHolder,
+ PageNumberTrackingHolder,
)
@@ -74,34 +74,32 @@ def test_chunk_holder_creation():
mark_up="Sample markup",
sections=["Section1", "Section2"],
figures=[],
- starting_sentence="First sentence",
cleaned_text="Cleaned text content",
page_number=1,
)
assert chunk.mark_up == "Sample markup"
assert chunk.sections == ["Section1", "Section2"]
- assert chunk.starting_sentence == "First sentence"
assert chunk.cleaned_text == "Cleaned text content"
assert chunk.page_number == 1
-def test_per_page_starting_sentence_holder_creation():
- sentence = PerPageStartingSentenceHolder(
- page_number=1, starting_sentence="This is the starting sentence."
+def test_per_page_page_content_holder_creation():
+ sentence = PageNumberTrackingHolder(
+ page_number=1, page_content="This is the full content."
)
assert sentence.page_number == 1
- assert sentence.starting_sentence == "This is the starting sentence."
+ assert sentence.page_content == "This is the full content."
-def test_non_page_wise_content_holder_with_sentences():
+def test_non_page_wise_content_holder_with_page_number_trackers():
layout = LayoutHolder(content="Full document")
- sentences = [
- PerPageStartingSentenceHolder(page_number=1, starting_sentence="Start 1"),
- PerPageStartingSentenceHolder(page_number=2, starting_sentence="Start 2"),
+ page_number_trackers = [
+ PageNumberTrackingHolder(page_number=1, page_content="Start 1"),
+ PageNumberTrackingHolder(page_number=2, page_content="Start 2"),
]
non_page_holder = NonPageWiseContentHolder(
- layout=layout, per_page_starting_sentences=sentences
+ layout=layout, page_number_tracking_holders=page_number_trackers
)
assert non_page_holder.layout.content == "Full document"
- assert len(non_page_holder.per_page_starting_sentences) == 2
- assert non_page_holder.per_page_starting_sentences[0].starting_sentence == "Start 1"
+ assert len(non_page_holder.page_number_tracking_holders) == 2
+ assert non_page_holder.page_number_tracking_holders[0].page_content == "Start 1"
diff --git a/image_processing/tests/image_processing/test_semantic_text_chunker.py b/image_processing/tests/image_processing/test_semantic_text_chunker.py
index 59e8364..07277c4 100644
--- a/image_processing/tests/image_processing/test_semantic_text_chunker.py
+++ b/image_processing/tests/image_processing/test_semantic_text_chunker.py
@@ -1,3 +1,5 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
import pytest
from unittest.mock import AsyncMock, MagicMock
@@ -6,6 +8,8 @@
SemanticTextChunker,
)
+from layout_holders import ChunkHolder, PageNumberTrackingHolder
+
# --- Dummy Classes for Process-Level Tests ---
@@ -18,75 +22,80 @@ def model_dump(self, by_alias=False):
return {"mark_up": self.mark_up, "page_number": self.page_number}
-class DummyPerPageStartingSentenceHolder:
- def __init__(self, starting_sentence, page_number):
- self.starting_sentence = starting_sentence
+class DummyPageNumberTrackingHolder:
+ def __init__(self, page_content, page_number):
+ self.page_content = page_content
self.page_number = page_number
# --- Process-Level Tests (Using Dummy Chunker) ---
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_success_without_page():
- """Test a successful chunking when no per-page starting sentences are provided."""
- record = {"recordId": "1", "data": {"content": "Some content to be chunked."}}
+@pytest.mark.parametrize(
+ "chunk_contents, page_content, expected_page",
+ [
+ # Test matching on markdown heading
+ (["# Title", "Content"], "# Title", 2),
+ # Test matching on newline content
+ (["First line", "Second line"], "First line", 3),
+ # Test matching on period
+ (["First sentence. Second sentence"], "First sentence. Second sentence", 4),
+ # Test matching on table
+ (["
Table content
"], "", 1),
+ # Test no match (should get default page 1)
+ (["Content not in any page_content"], "Different content", 1),
+ ],
+)
+def test_assign_page_number_to_chunks(chunk_contents, page_content, expected_page):
+ """Test the page assignment logic for different types of content."""
+ # Create a real SemanticTextChunker instance
+ chunker = SemanticTextChunker()
+
+ # Create chunks with the different content types under test
+ chunks = [ChunkHolder(mark_up=chunk_content) for chunk_content in chunk_contents]
+
+ # Create page tracking holders
+ page_tracking_holders = [
+ PageNumberTrackingHolder(page_content="", page_number=1),
+ PageNumberTrackingHolder(page_content="# Title", page_number=2),
+ PageNumberTrackingHolder(page_content="First line", page_number=3),
+ PageNumberTrackingHolder(page_content="First sentence", page_number=4),
+ PageNumberTrackingHolder(page_content="Different content", page_number=5),
+ ]
- dummy_chunk = DummyChunkHolder("chunk1")
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk])
- dummy_text_chunker.assign_page_number_to_chunks = MagicMock()
+ # Call the method being tested
+ result_chunks = chunker.assign_page_number_to_chunks(chunks, page_tracking_holders)
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "1"
- assert result["data"] is not None
- chunks = result["data"]["chunks"]
- assert isinstance(chunks, list)
- assert len(chunks) == 1
- assert chunks[0]["mark_up"] == "chunk1"
- # When no page info is provided, page_number remains unchanged (None in our dummy).
- assert chunks[0]["page_number"] is None
+ # Verify the page number was correctly assigned
+ assert result_chunks[0].page_number == expected_page
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_success_with_page():
- """Test a successful chunking when per-page starting sentences are provided and match a chunk."""
- record = {
- "recordId": "2",
- "data": {
- "content": "Some content to be chunked.",
- "per_page_starting_sentences": [
- {"starting_sentence": "chunk", "page_number": 5}
- ],
- },
- }
+def test_assign_page_number_to_chunks_multiple_chunks():
+ """Test assigning page numbers to multiple chunks."""
+ chunker = SemanticTextChunker()
- dummy_chunk = DummyChunkHolder("This dummy chunk contains chunk in its text")
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk])
+ # Create multiple chunks
+ chunks = [
+ ChunkHolder(mark_up="# Introduction\nThis is the first section."),
+ ChunkHolder(mark_up="# Methods\nThis describes the methods used."),
+ ChunkHolder(mark_up="# Results\nThese are the results."),
+ ]
- def dummy_assign_page(chunks, per_page_starting_sentences):
- ps_objs = [
- DummyPerPageStartingSentenceHolder(**ps.__dict__)
- for ps in per_page_starting_sentences
- ]
- page_number = 1
- for chunk in chunks:
- for ps in ps_objs:
- if ps.starting_sentence in chunk.mark_up:
- page_number = ps.page_number
- break
- chunk.page_number = page_number
- return chunks
+ # Create page tracking holders for different sections
+ page_tracking_holders = [
+ PageNumberTrackingHolder(page_content="# Introduction", page_number=1),
+ PageNumberTrackingHolder(page_content="# Methods", page_number=3),
+ PageNumberTrackingHolder(page_content="# Results", page_number=5),
+ ]
- dummy_text_chunker.assign_page_number_to_chunks = dummy_assign_page
+ # Call the method being tested
+ result_chunks = chunker.assign_page_number_to_chunks(chunks, page_tracking_holders)
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "2"
- chunks = result["data"]["chunks"]
- assert isinstance(chunks, list)
- assert len(chunks) == 1
- assert chunks[0]["page_number"] == 5
+ # Verify page numbers were correctly assigned
+ assert result_chunks[0].page_number == 1
+ assert result_chunks[1].page_number == 3
+ assert result_chunks[2].page_number == 5
@pytest.mark.asyncio
@@ -119,9 +128,9 @@ async def test_process_semantic_text_chunker_multiple_chunks():
"recordId": "4",
"data": {
"content": "Content that generates multiple chunks.",
- "per_page_starting_sentences": [
- {"starting_sentence": "first_page", "page_number": 3},
- {"starting_sentence": "second_page", "page_number": 4},
+ "page_number_tracking_holders": [
+ {"page_content": "first_page", "page_number": 3},
+ {"page_content": "second_page", "page_number": 4},
],
},
}
@@ -131,15 +140,15 @@ async def test_process_semantic_text_chunker_multiple_chunks():
dummy_text_chunker = MagicMock()
dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk1, dummy_chunk2])
- def dummy_assign_page(chunks, per_page_starting_sentences):
+ def dummy_assign_page(chunks, page_number_tracking_holders):
ps_objs = [
- DummyPerPageStartingSentenceHolder(**ps.__dict__)
- for ps in per_page_starting_sentences
+ DummyPageNumberTrackingHolder(**ps.__dict__)
+ for ps in page_number_tracking_holders
]
page_number = 1
for chunk in chunks:
for ps in ps_objs:
- if ps.starting_sentence in chunk.mark_up:
+ if ps.page_content in chunk.mark_up:
page_number = ps.page_number
break
chunk.page_number = page_number
@@ -156,55 +165,6 @@ def dummy_assign_page(chunks, per_page_starting_sentences):
assert chunks[1]["page_number"] == 4
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_empty_page_sentences():
- """
- Test a record where 'per_page_starting_sentences' exists but is empty.
- In this case, the default page (1) is assigned.
- """
- record = {
- "recordId": "5",
- "data": {
- "content": "Some content to be chunked.",
- "per_page_starting_sentences": [],
- },
- }
-
- dummy_chunk = DummyChunkHolder("Chunk without any page indicator")
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[dummy_chunk])
-
- def dummy_assign_page(chunks, per_page_starting_sentences):
- for chunk in chunks:
- chunk.page_number = 1
- return chunks
-
- dummy_text_chunker.assign_page_number_to_chunks = dummy_assign_page
-
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "5"
- chunks = result["data"]["chunks"]
- assert isinstance(chunks, list)
- assert len(chunks) == 1
- assert chunks[0]["page_number"] == 1
-
-
-@pytest.mark.asyncio
-async def test_process_semantic_text_chunker_missing_data():
- """
- Test that if the record is missing the 'data' key, the function returns an error.
- """
- record = {"recordId": "6"}
- dummy_text_chunker = MagicMock()
- dummy_text_chunker.chunk = AsyncMock(return_value=[DummyChunkHolder("chunk")])
- dummy_text_chunker.assign_page_number_to_chunks = MagicMock()
-
- result = await process_semantic_text_chunker(record, dummy_text_chunker)
- assert result["recordId"] == "6"
- assert result["data"] is None
- assert "errors" in result
-
-
@pytest.mark.asyncio
async def test_process_semantic_text_chunker_empty_content():
"""
@@ -244,7 +204,7 @@ def __init__(self, text):
class DummyNLP:
- def __call__(self, text):
+ def __call__(self, text, disable):
return DummyDoc(text)
@@ -253,7 +213,6 @@ def __call__(self, text):
def chunker():
# Use relaxed thresholds so that even short sentences qualify.
stc = SemanticTextChunker(
- num_surrounding_sentences=1,
similarity_threshold=0.8,
max_chunk_tokens=1000,
min_chunk_tokens=1,
@@ -267,43 +226,6 @@ def chunker():
return stc
-# --- Chunk Splitting Tests Using Real (Patched) Chunker ---
-
-
-@pytest.mark.asyncio
-async def test_chunk_complete_figure(chunker):
- """
- Test a text containing a complete <figure> element.
- Expect that the sentence with the complete figure is detected and grouped.
- """
- text = "Text before. Figure content. Text after."
- chunks = await chunker.chunk(text)
- # For our dummy segmentation, we expect two final chunks:
- # one that combines "Text before" and the figure, and one for "Text after".
- assert len(chunks) == 2
- # Check that the first chunk contains a complete figure.
- assert "
" in chunks[0].mark_up
-
-
-@pytest.mark.asyncio
-async def test_chunk_incomplete_figure(chunker):
- """
- Test a text with an incomplete figure element spanning multiple sentences.
- The start and end of the figure should be grouped together.
- """
- text = (
- "Text before. Start of figure. Figure continues . Text after."
- )
- chunks = await chunker.chunk(text)
- # Expected grouping: one chunk combining the normal text and the grouped figure,
- # and another chunk for text after.
- assert len(chunks) == 2
- # Check that the grouped chunk contains both the start and the end of the figure.
- assert "
" in chunks[0].mark_up
-
-
@pytest.mark.asyncio
async def test_chunk_markdown_heading(chunker):
"""
@@ -338,7 +260,6 @@ async def test_chunk_long_sentence():
"""
# Create a chunker that forces a long sentence to exceed the max token threshold.
stc = SemanticTextChunker(
- num_surrounding_sentences=1,
similarity_threshold=0.8,
max_chunk_tokens=5, # set low so even a few words exceed it
min_chunk_tokens=1,
@@ -353,3 +274,197 @@ async def test_chunk_long_sentence():
# And because 12 >= 5, that sentence is immediately appended as a chunk.
assert len(chunks) == 1
assert "exceed" in chunks[0].mark_up
+
+
+def test_assign_page_number_with_html_comments():
+ """Test that HTML comments are properly stripped when assigning page numbers."""
+ chunker = SemanticTextChunker()
+
+ # Create a chunk with HTML comments
+ chunk = ChunkHolder(mark_up="<!-- PageNumber='3' --> First line\nSecond line")
+
+ # Create page tracking holders
+ page_tracking_holders = [
+ PageNumberTrackingHolder(page_content="First line\nSecond line", page_number=3),
+ ]
+
+ # Call the method being tested
+ result_chunks = chunker.assign_page_number_to_chunks([chunk], page_tracking_holders)
+
+ # Verify the page number was correctly assigned despite the HTML comment
+ assert result_chunks[0].page_number == 3
+
+
+@pytest.mark.asyncio
+async def test_clean_new_lines():
+ """Test the clean_new_lines method properly processes newlines."""
+ chunker = SemanticTextChunker()
+
+ # Test with various newline patterns
+ text = "
First line\nSecond line
\n\n
Next paragraph
"
+ result = chunker.clean_new_lines(text)
+
+ # Check that single newlines between tags are removed
+ assert "
First line Second line
" in result
+ # Check that multiple newlines are replaced with space + \n\n
+ assert " \n\n
" in result
+
+
+@pytest.mark.asyncio
+async def test_filter_empty_figures():
+ """Test the filter_empty_figures method removes empty figure tags."""
+ chunker = SemanticTextChunker()
+
+ # Test with empty and non-empty figures
+ text = "
Text
More text
Content"
+ result = chunker.filter_empty_figures(text)
+
+ # Check that empty figures are removed
+ assert "" not in result
+ # Check that non-empty figures remain
+ assert "Content" in result
+
+
+@pytest.mark.asyncio
+async def test_group_figures_and_tables():
+ """Test grouping of figures and tables into sentences."""
+ chunker = SemanticTextChunker()
+
+ sentences = ["Before table.", "
" in grouped
+ # Check table map is correct
+ assert is_table_map == [False, True, False]
+
+
+@pytest.mark.asyncio
+async def test_remove_figures():
+ """Test the remove_figures method."""
+ chunker = SemanticTextChunker()
+
+ text = 'Text before <figure FigureId="12">Figure content</figure> text after'
+ result = chunker.remove_figures(text)
+
+ assert "Text before text after" == result
+ assert "
+
Cell 1
Cell 2
+
+
+> Blockquote text"""
+
+ chunks = await chunker.chunk(text)
+
+ # Verify we have reasonable chunks
+ assert len(chunks) >= 1
+
+ # Check heading formatting
+ heading_chunks = [c for c in chunks if "# Heading 1" in c.mark_up]
+ assert len(heading_chunks) > 0
+ assert "# Heading" in heading_chunks[0].mark_up
+
+
+@pytest.mark.asyncio
+async def test_process_page_tracking_no_match():
+ """Test behavior when page_number_tracking_holders is provided but doesn't match any chunks."""
+ record = {
+ "recordId": "8",
+ "data": {
+ "content": "Unique content that won't match page tracking.",
+ "page_number_tracking_holders": [
+ {"page_content": "Something completely different", "page_number": 10}
+ ],
+ },
+ }
+
+ chunker = SemanticTextChunker()
+ result = await process_semantic_text_chunker(record, chunker)
+
+ # Should default to page 1 when no match is found
+ assert result["data"]["chunks"][0]["page_number"] == 1
+
+
+@pytest.mark.asyncio
+async def test_nested_html_structure():
+ """Test handling of nested HTML tags."""
+ chunker = SemanticTextChunker()
+
+ text = """
+
Paragraph with bold text and italic text
+
+
Header 1
Header 2
+
Value 1
Value 2
+
+
"""
+
+ chunks = await chunker.chunk(text)
+
+ # Verify we get at least one chunk
+ assert len(chunks) > 0
+ # Check that the table is kept intact in one chunk
+ table_chunks = [
+ c for c in chunks if "<table>" in c.mark_up