@@ -8,7 +8,6 @@
 from typing import Literal
 from unittest.mock import patch
 
-import pandas as pd
 import pytest
 from azure.ai.formrecognizer import AnalyzeResult
 
@@ -148,11 +147,15 @@ def result(self) -> AnalyzeResult:
         docs = out["documents"]
         assert len(docs) == 2
         # Checking the table doc extracted
-        assert docs[0].content_type == "table"
-        assert docs[0].dataframe.shape[0] == 4  # number of rows
-        assert docs[0].dataframe.shape[1] == 4  # number of columns
-        assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
-        assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
+        assert (
+            docs[0].content
+            == """,Column 1,Column 2,Column 3
+A,324,55 million units,2022
+B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
+C,23.53%,A short string.,
+D,$54.35,$6345.,
+"""
+        )
         assert (
             docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
             "standardized and their\nspecification is published only on "
@@ -191,13 +194,21 @@ def result(self) -> AnalyzeResult:
         docs = out["documents"]
         assert len(docs) == 2
         # Checking the table doc extracted that is missing bounding info
-        assert docs[0].content_type == "table"
-        assert docs[0].dataframe.shape[0] == 4  # number of rows
-        assert docs[0].dataframe.shape[1] == 4  # number of columns
-        assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
-        assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
-        # TODO below assert fails
-        # assert docs[0].meta["preceding_context"] == ""
+        assert (
+            docs[0].content
+            == """,Column 1,Column 2,Column 3
+A,324,55 million units,2022
+B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
+C,23.53%,A short string.,
+D,$54.35,$6345.,
+"""
+        )
+        assert docs[0].meta["preceding_context"] == (
+            "specification. These proprietary technologies are not standardized and their\nspecification is published "
+            "only on Adobe's website. Many of them are also not\nsupported by popular third-party implementations of "
+            "PDF."
+        )
+        assert docs[0].meta["following_context"] == ""
 
     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_azure_converter_with_multicolumn_header_table(self, mock_resolve_value, test_files_path) -> None:
@@ -213,20 +224,17 @@ def result(self) -> AnalyzeResult:
             azure_mock.return_value = MockPoller()
             ocr_node = AzureOCRDocumentConverter(endpoint="")
 
-            # TODO: fails because of non-unique column names, azure_sample_pdf_3.json has duplicate column names
             out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_3.pdf"])
 
         docs = out["documents"]
         assert len(docs) == 2
-        assert docs[0].content_type == "table"
-        assert docs[0].dataframe.shape[0] == 1  # number of rows
-        assert docs[0].dataframe.shape[1] == 3  # number of columns
-        assert list(docs[0].dataframe.columns) == ["This is a subheader", "This is a subheader", "This is a subheader"]
-        assert list(docs[0].dataframe.iloc[0]) == ["Value 1", "Value 2", "Val 3"]
+        assert docs[0].content == "This is a subheader,This is a subheader,This is a subheader\nValue 1,Value 2,Val 3\n"
         assert (
             docs[0].meta["preceding_context"]
             == "Table 1. This is an example table with two multicolumn headers\nHeader 1"
         )
+        assert docs[0].meta["following_context"] == ""
+        assert docs[0].meta["page"] == 1
 
     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_table_pdf_with_non_empty_meta(self, mock_resolve_value, test_files_path) -> None:
@@ -244,7 +252,6 @@ def result(self) -> AnalyzeResult:
             out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta=[{"test": "value_1"}])
 
         docs = out["documents"]
-        # TODO assert below changed from the original test
         assert docs[1].meta["test"] == "value_1"
 
     @pytest.mark.integration
@@ -307,27 +314,6 @@ def test_run_with_store_full_path_false(self, test_files_path):
         assert "Sample Docx File" in documents[0].content
         assert documents[0].meta["file_path"] == "sample_docx.docx"
 
-    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
-    def test_hashing_dataframe(self, mock_resolve_value):
-        mock_resolve_value.return_value = "test_api_key"
-        component = AzureOCRDocumentConverter(endpoint="")
-        hash_length = 32
-
-        df = pd.DataFrame({"A": [1, 2, 3]})
-        hash_string_1 = component._hash_dataframe(df)
-        assert len(hash_string_1) == hash_length
-
-        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
-        hash_string_2 = component._hash_dataframe(df)
-        assert len(hash_string_2) == hash_length
-
-        df = pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]})
-        hash_string_3 = component._hash_dataframe(df)
-        assert len(hash_string_3) == hash_length
-
-        # doesn't mean much, more for sanity check
-        assert hash_string_1 != hash_string_2 != hash_string_3
-
     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_meta_from_byte_stream(self, mock_resolve_value, test_files_path) -> None:
         mock_resolve_value.return_value = "test_api_key"
@@ -341,8 +327,8 @@ def result(self) -> AnalyzeResult:
         with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
             azure_mock.return_value = MockPoller()
             ocr_node = AzureOCRDocumentConverter(endpoint="")
-            bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
-            byte_stream = ByteStream(data=bytes, meta={"test_from": "byte_stream"})
+            bytes_ = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
+            byte_stream = ByteStream(data=bytes_, meta={"test_from": "byte_stream"})
             out = ocr_node.run(sources=[byte_stream], meta=[{"test": "value_1"}])
 
         docs = out["documents"]