Skip to content

Commit c17067a

Browse files
Add Unit Tests for Image Processing + Page Number Tracking
1 parent 064d406 commit c17067a

22 files changed

+2160
-174
lines changed

.github/workflows/ci-checks.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,29 @@ jobs:
3636

3737
- name: Run pre-commit
3838
run: uv run pre-commit run --all-files
39+
40+
job-image-processing-unit-tests:
41+
name: Image Processing Unit Tests
42+
runs-on: ubuntu-latest
43+
44+
steps:
45+
- name: Checkout code
46+
uses: actions/checkout@v3
47+
48+
- name: Set up Python
49+
uses: actions/setup-python@v3
50+
with:
51+
python-version: ${{ env.MIN_PYTHON_VERSION }}
52+
53+
- name: Install uv
54+
uses: astral-sh/setup-uv@v4
55+
with:
56+
enable-cache: true
57+
58+
- name: Install the project
59+
run: uv sync
60+
working-directory: image_processing
61+
62+
- name: Run PyTest
63+
run: uv run pytest --cov=. --cov-config=.coveragerc
64+
working-directory: image_processing

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ repos:
1818

1919
# Python checks
2020
- id: name-tests-test
21+
args: [--pytest-test-first]
2122

2223
# JSON files
2324
- id: pretty-format-json

deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,11 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
219219
mark_up_cleaner_context = "/document/page_wise_layout/*"
220220
inputs = [
221221
InputFieldMappingEntry(
222-
name="chunk", source="/document/page_wise_layout/*/merged_content"
222+
name="mark_up", source="/document/page_wise_layout/*/merged_content"
223+
),
224+
InputFieldMappingEntry(
225+
name="page_number",
226+
source="/document/page_wise_layout/*/page_number",
223227
),
224228
InputFieldMappingEntry(
225229
name="figures",
@@ -230,20 +234,26 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
230234
mark_up_cleaner_context = "/document/chunk_mark_ups/*"
231235
inputs = [
232236
InputFieldMappingEntry(
233-
name="chunk", source="/document/chunk_mark_ups/*"
237+
name="mark_up", source="/document/chunk_mark_ups/*/mark_up"
238+
),
239+
InputFieldMappingEntry(
240+
name="page_number", source="/document/chunk_mark_ups/*/page_number"
234241
),
235242
InputFieldMappingEntry(
236243
name="figures", source="/document/layout/figures/*/updated_figure"
237244
),
238245
]
239246

240247
mark_up_cleaner_skill_outputs = [
241-
OutputFieldMappingEntry(name="chunk_cleaned", target_name="chunk_cleaned"),
242248
OutputFieldMappingEntry(
243-
name="chunk_sections", target_name="chunk_sections"
249+
name="cleaned_text", target_name="final_cleaned_text"
250+
),
251+
OutputFieldMappingEntry(name="sections", target_name="final_sections"),
252+
OutputFieldMappingEntry(name="mark_up", target_name="final_mark_up"),
253+
OutputFieldMappingEntry(name="figures", target_name="final_chunk_figures"),
254+
OutputFieldMappingEntry(
255+
name="page_number", target_name="final_page_number"
244256
),
245-
OutputFieldMappingEntry(name="chunk_mark_up", target_name="chunk_mark_up"),
246-
OutputFieldMappingEntry(name="chunk_figures", target_name="chunk_figures"),
247257
]
248258

249259
mark_up_cleaner_skill = WebApiSkill(
@@ -302,7 +312,11 @@ def get_semantic_chunker_skill(
302312
semantic_text_chunker_skill_inputs = [
303313
InputFieldMappingEntry(
304314
name="content", source="/document/layout_merged_content"
305-
)
315+
),
316+
InputFieldMappingEntry(
317+
name="per_page_starting_sentences",
318+
source="/document/per_page_starting_sentences",
319+
),
306320
]
307321

308322
semantic_text_chunker_skill_outputs = [
@@ -368,7 +382,13 @@ def get_layout_analysis_skill(
368382
)
369383
]
370384
else:
371-
output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
385+
output = [
386+
OutputFieldMappingEntry(name="layout", target_name="layout"),
387+
OutputFieldMappingEntry(
388+
name="per_page_starting_sentences",
389+
target_name="per_page_starting_sentences",
390+
),
391+
]
372392

373393
layout_analysis_skill = WebApiSkill(
374394
name="Layout Analysis Skill",

deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ def get_index_fields(self) -> list[SearchableField]:
8181
type=SearchFieldDataType.String,
8282
collection=True,
8383
),
84+
SimpleField(
85+
name="PageNumber",
86+
type=SearchFieldDataType.Int64,
87+
sortable=True,
88+
filterable=True,
89+
facetable=True,
90+
),
8491
SearchField(
8592
name="ChunkEmbedding",
8693
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
@@ -137,19 +144,6 @@ def get_index_fields(self) -> list[SearchableField]:
137144
),
138145
]
139146

140-
if self.enable_page_by_chunking:
141-
fields.extend(
142-
[
143-
SimpleField(
144-
name="PageNumber",
145-
type=SearchFieldDataType.Int64,
146-
sortable=True,
147-
filterable=True,
148-
facetable=True,
149-
)
150-
]
151-
)
152-
153147
return fields
154148

155149
def get_semantic_search(self) -> SemanticSearch:
@@ -194,11 +188,12 @@ def get_skills(self) -> list:
194188
if self.enable_page_by_chunking:
195189
embedding_skill = self.get_vector_skill(
196190
"/document/page_wise_layout/*",
197-
"/document/page_wise_layout/*/chunk_cleaned",
191+
"/document/page_wise_layout/*/final_cleaned_text",
198192
)
199193
else:
200194
embedding_skill = self.get_vector_skill(
201-
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/chunk_cleaned"
195+
"/document/chunk_mark_ups/*",
196+
"/document/chunk_mark_ups/*/final_cleaned_text",
202197
)
203198

204199
if self.enable_page_by_chunking:
@@ -229,7 +224,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
229224
source_context = "/document/page_wise_layout/*"
230225
mappings = [
231226
InputFieldMappingEntry(
232-
name="Chunk", source="/document/page_wise_layout/*/chunk_mark_up"
227+
name="Chunk", source="/document/page_wise_layout/*/final_mark_up"
233228
),
234229
InputFieldMappingEntry(
235230
name="ChunkEmbedding",
@@ -239,24 +234,25 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
239234
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
240235
InputFieldMappingEntry(
241236
name="Sections",
242-
source="/document/page_wise_layout/*/chunk_sections",
237+
source="/document/page_wise_layout/*/final_sections",
243238
),
244239
InputFieldMappingEntry(
245240
name="ChunkFigures",
246-
source="/document/page_wise_layout/*/chunk_figures/*",
241+
source="/document/page_wise_layout/*/final_chunk_figures/*",
247242
),
248243
InputFieldMappingEntry(
249244
name="DateLastModified", source="/document/DateLastModified"
250245
),
251246
InputFieldMappingEntry(
252-
name="PageNumber", source="/document/page_wise_layout/*/page_number"
247+
name="PageNumber",
248+
source="/document/page_wise_layout/*/final_page_number",
253249
),
254250
]
255251
else:
256252
source_context = "/document/chunk_mark_ups/*"
257253
mappings = [
258254
InputFieldMappingEntry(
259-
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
255+
name="Chunk", source="/document/chunk_mark_ups/*/final_mark_up"
260256
),
261257
InputFieldMappingEntry(
262258
name="ChunkEmbedding",
@@ -265,15 +261,19 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
265261
InputFieldMappingEntry(name="Title", source="/document/Title"),
266262
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
267263
InputFieldMappingEntry(
268-
name="Sections", source="/document/chunk_mark_ups/*/chunk_sections"
264+
name="Sections", source="/document/chunk_mark_ups/*/final_sections"
269265
),
270266
InputFieldMappingEntry(
271267
name="ChunkFigures",
272-
source="/document/chunk_mark_ups/*/chunk_figures/*",
268+
source="/document/chunk_mark_ups/*/final_chunk_figures/*",
273269
),
274270
InputFieldMappingEntry(
275271
name="DateLastModified", source="/document/DateLastModified"
276272
),
273+
InputFieldMappingEntry(
274+
name="PageNumber",
275+
source="/document/chunk_mark_ups/*/final_page_number",
276+
),
277277
]
278278

279279
index_projections = SearchIndexerIndexProjection(

image_processing/.coveragerc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[run]
2+
omit =
3+
tests/*
4+
*/__init__.py
5+
6+
[report]
7+
omit =
8+
tests/*
9+
*/__init__.py
10+
exclude_lines =
11+
if __name__ == "__main__":

image_processing/pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,9 @@ dev = [
4343
"pygments>=2.18.0",
4444
"ruff>=0.8.1",
4545
"python-dotenv>=1.0.1",
46+
"coverage>=7.6.12",
47+
"pytest>=8.3.4",
48+
"pytest-asyncio>=0.25.3",
49+
"pytest-cov>=6.0.0",
50+
"pytest-mock>=3.14.0",
4651
]

image_processing/pytest.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[pytest]
2+
pythonpath = src/image_processing

image_processing/src/image_processing/__init__.py

Whitespace-only changes.

image_processing/src/image_processing/layout_analysis.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
LayoutHolder,
2323
PageWiseContentHolder,
2424
NonPageWiseContentHolder,
25+
PerPageStartingSentenceHolder,
2526
)
2627

2728

@@ -340,6 +341,40 @@ def create_page_wise_content(self) -> list[LayoutHolder]:
340341

341342
return page_wise_contents
342343

344+
def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
    """Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.

    For every page in ``self.result.pages`` the page text is sliced out of
    ``self.result.content`` using the page's first span, and the opening
    line (or opening sentence) of that text is recorded with the page number.

    Returns:
    --------
        list: One PerPageStartingSentenceHolder per page, in page order."""

    per_page_starting_sentences = []

    for page in self.result.pages:
        # Only the first span of a page is consulted. A page that reports no
        # spans gets an empty starting sentence rather than raising an
        # IndexError that would abort the whole document.
        if page.spans:
            first_span = page.spans[0]
            start = first_span["offset"]
            page_content = self.result.content[start : start + first_span["length"]]
        else:
            page_content = ""

        # Remove any leading whitespace/newlines.
        cleaned_content = page_content.lstrip()

        # A newline always wins as the delimiter when one is present; the
        # period is only used for text that contains no newline at all.
        if "\n" in cleaned_content:
            first_line = cleaned_content.split("\n", 1)[0]
        elif "." in cleaned_content:
            first_line = cleaned_content.split(".", 1)[0]
        else:
            first_line = cleaned_content

        per_page_starting_sentences.append(
            PerPageStartingSentenceHolder(
                page_number=page.page_number,
                starting_sentence=first_line.strip(),
            )
        )

    return per_page_starting_sentences
377+
343378
async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
344379
"""Get the Azure Document Intelligence client.
345380
@@ -487,7 +522,12 @@ async def analyse(self):
487522
if self.extract_figures:
488523
await self.process_figures_from_extracted_content(text_content)
489524

490-
output_record = NonPageWiseContentHolder(layout=text_content)
525+
per_page_starting_sentences = self.create_per_page_starting_sentence()
526+
527+
output_record = NonPageWiseContentHolder(
528+
layout=text_content,
529+
per_page_starting_sentences=per_page_starting_sentences,
530+
)
491531

492532
except Exception as e:
493533
logging.error(e)

image_processing/src/image_processing/layout_holders.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77

88
class FigureHolder(BaseModel):
9-
109
"""A class to hold the figure extracted from the document."""
1110

1211
figure_id: str = Field(..., alias="FigureId")
@@ -48,7 +47,28 @@ class PageWiseContentHolder(BaseModel):
4847
page_wise_layout: list[LayoutHolder]
4948

5049

50+
class PerPageStartingSentenceHolder(BaseModel):
51+
"""A class to hold the starting sentence of a page together with that page's number."""
52+
53+
# Number of the page the sentence belongs to.
page_number: int
54+
# Text that opens the page.
starting_sentence: str
55+
56+
5157
class NonPageWiseContentHolder(BaseModel):
5258
"""A class to hold the non-page-wise content extracted from the document.

Carries the document layout plus a list of per-page starting sentences."""
5359

5460
layout: LayoutHolder
61+
# Falls back to a fresh empty list when no starting sentences are supplied.
per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
62+
default_factory=list
63+
)
64+
65+
66+
class ChunkHolder(BaseModel):
    """Hold a piece of text extracted from the document after chunking.

    Only ``mark_up`` is required; every other field has a default.
    """

    # Required field.
    mark_up: str

    # Collection fields: each instance gets its own empty list by default.
    sections: Optional[list[str]] = Field(default_factory=list)
    figures: Optional[list[FigureHolder]] = Field(default_factory=list)

    # Scalar fields: None until a value is supplied.
    starting_sentence: Optional[str] = None
    cleaned_text: Optional[str] = None
    page_number: Optional[int] = None

0 commit comments

Comments
 (0)