Skip to content

Commit 52a0282

Browse files
authored
refactor!: update AzureOCRDocumentConverter to not use the dataframe field for tabular Documents (#8885)
* Save document as a csv table now * Fix tests * Fix tests * Add reno
1 parent 209e6d5 commit 52a0282

File tree

3 files changed

+49
-78
lines changed

3 files changed

+49
-78
lines changed

haystack/components/converters/azure.py

Lines changed: 8 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import copy
6-
import hashlib
76
import os
87
from collections import defaultdict
98
from pathlib import Path
109
from typing import Any, Dict, List, Literal, Optional, Union
1110

1211
import networkx as nx
13-
import pandas as pd
1412

1513
from haystack import Document, component, default_from_dict, default_to_dict, logging
1614
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -24,6 +22,9 @@
2422
from azure.ai.formrecognizer import AnalyzeResult, DocumentAnalysisClient, DocumentLine, DocumentParagraph
2523
from azure.core.credentials import AzureKeyCredential
2624

25+
with LazyImport(message="Run 'pip install pandas'") as pandas_import:
26+
import pandas as pd
27+
2728

2829
@component
2930
class AzureOCRDocumentConverter:
@@ -90,6 +91,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
9091
If False, only the file name is stored.
9192
"""
9293
azure_import.check()
94+
pandas_import.check()
9395

9496
self.document_analysis_client = DocumentAnalysisClient(
9597
endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value() or "")
@@ -303,13 +305,10 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]
303305
if table.bounding_regions:
304306
table_meta["page"] = table.bounding_regions[0].page_number
305307

306-
table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
307-
308-
# Use custom ID for tables, as columns might not be unique and thus failing in the default ID generation
309-
pd_hashes = self._hash_dataframe(table_df)
310-
data = f"{pd_hashes}{table_meta}"
311-
doc_id = hashlib.sha256(data.encode()).hexdigest()
312-
converted_tables.append(Document(id=doc_id, dataframe=table_df, meta=table_meta))
308+
# Convert table to CSV
309+
table_df = pd.DataFrame(data=table_list)
310+
table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")
311+
converted_tables.append(Document(content=table_content, meta=table_meta))
313312

314313
return converted_tables
315314

@@ -479,29 +478,3 @@ def _check_if_in_table(
479478
in_table = True
480479
break
481480
return in_table
482-
483-
def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str:
484-
"""
485-
Returns a hash of the DataFrame content.
486-
487-
The hash is based on the content of the DataFrame.
488-
:param df: The DataFrame to hash.
489-
:param desired_samples: The desired number of samples to hash.
490-
:param hash_length: The length of the hash for each sample.
491-
492-
:returns: A hash of the DataFrame content.
493-
"""
494-
# take adaptive sample of rows to hash because we can have very large dataframes
495-
hasher = hashlib.md5()
496-
total_rows = len(df)
497-
# sample rate based on DataFrame size and desired number of samples
498-
sample_rate = max(1, total_rows // desired_samples)
499-
500-
hashes = pd.util.hash_pandas_object(df, index=True)
501-
sampled_hashes = hashes[::sample_rate]
502-
503-
for hash_value in sampled_hashes:
504-
partial_hash = str(hash_value)[:hash_length].encode("utf-8")
505-
hasher.update(partial_hash)
506-
507-
return hasher.hexdigest()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
upgrade:
3+
- |
4+
The `AzureOCRDocumentConverter` no longer produces `Document` objects with the deprecated `dataframe` field.
5+
6+
**Am I affected?**
7+
- If your workflow relies on the `dataframe` field in `Document` objects generated by `AzureOCRDocumentConverter`, you are affected.
8+
- If you saw a `DeprecationWarning` in Haystack 2.10 when initializing a `Document` with a `dataframe`, this change removes that field entirely.
9+
10+
**How to handle the change:**
11+
- Instead of storing detected tables as a `dataframe`, `AzureOCRDocumentConverter` now represents tables as CSV-formatted text in the `content` field of the `Document`.
12+
- Update your processing logic to handle CSV-formatted tables instead of a `dataframe`. If needed, you can convert the CSV text back into a `dataframe` using `pandas.read_csv()`.

test/components/converters/test_azure_ocr_doc_converter.py

Lines changed: 29 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from typing import Literal
99
from unittest.mock import patch
1010

11-
import pandas as pd
1211
import pytest
1312
from azure.ai.formrecognizer import AnalyzeResult
1413

@@ -148,11 +147,15 @@ def result(self) -> AnalyzeResult:
148147
docs = out["documents"]
149148
assert len(docs) == 2
150149
# Checking the table doc extracted
151-
assert docs[0].content_type == "table"
152-
assert docs[0].dataframe.shape[0] == 4 # number of rows
153-
assert docs[0].dataframe.shape[1] == 4 # number of columns
154-
assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
155-
assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
150+
assert (
151+
docs[0].content
152+
== """,Column 1,Column 2,Column 3
153+
A,324,55 million units,2022
154+
B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
155+
C,23.53%,A short string.,
156+
D,$54.35,$6345.,
157+
"""
158+
)
156159
assert (
157160
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
158161
"standardized and their\nspecification is published only on "
@@ -191,13 +194,21 @@ def result(self) -> AnalyzeResult:
191194
docs = out["documents"]
192195
assert len(docs) == 2
193196
# Checking the table doc extracted that is missing bounding info
194-
assert docs[0].content_type == "table"
195-
assert docs[0].dataframe.shape[0] == 4 # number of rows
196-
assert docs[0].dataframe.shape[1] == 4 # number of columns
197-
assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
198-
assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
199-
# TODO below assert fails
200-
# assert docs[0].meta["preceding_context"] == ""
197+
assert (
198+
docs[0].content
199+
== """,Column 1,Column 2,Column 3
200+
A,324,55 million units,2022
201+
B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
202+
C,23.53%,A short string.,
203+
D,$54.35,$6345.,
204+
"""
205+
)
206+
assert docs[0].meta["preceding_context"] == (
207+
"specification. These proprietary technologies are not standardized and their\nspecification is published "
208+
"only on Adobe's website. Many of them are also not\nsupported by popular third-party implementations of "
209+
"PDF."
210+
)
211+
assert docs[0].meta["following_context"] == ""
201212

202213
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
203214
def test_azure_converter_with_multicolumn_header_table(self, mock_resolve_value, test_files_path) -> None:
@@ -213,20 +224,17 @@ def result(self) -> AnalyzeResult:
213224
azure_mock.return_value = MockPoller()
214225
ocr_node = AzureOCRDocumentConverter(endpoint="")
215226

216-
# TODO: fails because of non-unique column names, azure_sample_pdf_3.json has duplicate column names
217227
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_3.pdf"])
218228

219229
docs = out["documents"]
220230
assert len(docs) == 2
221-
assert docs[0].content_type == "table"
222-
assert docs[0].dataframe.shape[0] == 1 # number of rows
223-
assert docs[0].dataframe.shape[1] == 3 # number of columns
224-
assert list(docs[0].dataframe.columns) == ["This is a subheader", "This is a subheader", "This is a subheader"]
225-
assert list(docs[0].dataframe.iloc[0]) == ["Value 1", "Value 2", "Val 3"]
231+
assert docs[0].content == "This is a subheader,This is a subheader,This is a subheader\nValue 1,Value 2,Val 3\n"
226232
assert (
227233
docs[0].meta["preceding_context"]
228234
== "Table 1. This is an example table with two multicolumn headers\nHeader 1"
229235
)
236+
assert docs[0].meta["following_context"] == ""
237+
assert docs[0].meta["page"] == 1
230238

231239
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
232240
def test_table_pdf_with_non_empty_meta(self, mock_resolve_value, test_files_path) -> None:
@@ -244,7 +252,6 @@ def result(self) -> AnalyzeResult:
244252
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta=[{"test": "value_1"}])
245253

246254
docs = out["documents"]
247-
# TODO assert below changed from the original test
248255
assert docs[1].meta["test"] == "value_1"
249256

250257
@pytest.mark.integration
@@ -307,27 +314,6 @@ def test_run_with_store_full_path_false(self, test_files_path):
307314
assert "Sample Docx File" in documents[0].content
308315
assert documents[0].meta["file_path"] == "sample_docx.docx"
309316

310-
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
311-
def test_hashing_dataframe(self, mock_resolve_value):
312-
mock_resolve_value.return_value = "test_api_key"
313-
component = AzureOCRDocumentConverter(endpoint="")
314-
hash_length = 32
315-
316-
df = pd.DataFrame({"A": [1, 2, 3]})
317-
hash_string_1 = component._hash_dataframe(df)
318-
assert len(hash_string_1) == hash_length
319-
320-
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
321-
hash_string_2 = component._hash_dataframe(df)
322-
assert len(hash_string_2) == hash_length
323-
324-
df = pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]})
325-
hash_string_3 = component._hash_dataframe(df)
326-
assert len(hash_string_3) == hash_length
327-
328-
# doesn't mean much, more for sanity check
329-
assert hash_string_1 != hash_string_2 != hash_string_3
330-
331317
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
332318
def test_meta_from_byte_stream(self, mock_resolve_value, test_files_path) -> None:
333319
mock_resolve_value.return_value = "test_api_key"
@@ -341,8 +327,8 @@ def result(self) -> AnalyzeResult:
341327
with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
342328
azure_mock.return_value = MockPoller()
343329
ocr_node = AzureOCRDocumentConverter(endpoint="")
344-
bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
345-
byte_stream = ByteStream(data=bytes, meta={"test_from": "byte_stream"})
330+
bytes_ = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
331+
byte_stream = ByteStream(data=bytes_, meta={"test_from": "byte_stream"})
346332
out = ocr_node.run(sources=[byte_stream], meta=[{"test": "value_1"}])
347333

348334
docs = out["documents"]

0 commit comments

Comments
 (0)