Skip to content

Commit 52a0282

Browse files
authored
refactor!: update AzureOCRDocumentConverter to not use the dataframe field for tabular Documents (#8885)
* Save document as a csv table now * Fix tests * Fix tests * Add reno
1 parent 209e6d5 commit 52a0282

File tree

3 files changed

+49
-78
lines changed

3 files changed

+49
-78
lines changed

haystack/components/converters/azure.py

Lines changed: 8 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import copy
6-
import hashlib
76
import os
87
from collections import defaultdict
98
from pathlib import Path
109
from typing import Any, Dict, List, Literal, Optional, Union
1110

1211
import networkx as nx
13-
import pandas as pd
1412

1513
from haystack import Document, component, default_from_dict, default_to_dict, logging
1614
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -24,6 +22,9 @@
2422
from azure.ai.formrecognizer import AnalyzeResult, DocumentAnalysisClient, DocumentLine, DocumentParagraph
2523
from azure.core.credentials import AzureKeyCredential
2624

25+
with LazyImport(message="Run 'pip install pandas'") as pandas_import:
26+
import pandas as pd
27+
2728

2829
@component
2930
class AzureOCRDocumentConverter:
@@ -90,6 +91,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
9091
If False, only the file name is stored.
9192
"""
9293
azure_import.check()
94+
pandas_import.check()
9395

9496
self.document_analysis_client = DocumentAnalysisClient(
9597
endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value() or "")
@@ -303,13 +305,10 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]
303305
if table.bounding_regions:
304306
table_meta["page"] = table.bounding_regions[0].page_number
305307

306-
table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
307-
308-
# Use custom ID for tables, as columns might not be unique and thus failing in the default ID generation
309-
pd_hashes = self._hash_dataframe(table_df)
310-
data = f"{pd_hashes}{table_meta}"
311-
doc_id = hashlib.sha256(data.encode()).hexdigest()
312-
converted_tables.append(Document(id=doc_id, dataframe=table_df, meta=table_meta))
308+
# Convert table to CSV
309+
table_df = pd.DataFrame(data=table_list)
310+
table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")
311+
converted_tables.append(Document(content=table_content, meta=table_meta))
313312

314313
return converted_tables
315314

@@ -479,29 +478,3 @@ def _check_if_in_table(
479478
in_table = True
480479
break
481480
return in_table
482-
483-
def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str:
484-
"""
485-
Returns a hash of the DataFrame content.
486-
487-
The hash is based on the content of the DataFrame.
488-
:param df: The DataFrame to hash.
489-
:param desired_samples: The desired number of samples to hash.
490-
:param hash_length: The length of the hash for each sample.
491-
492-
:returns: A hash of the DataFrame content.
493-
"""
494-
# take adaptive sample of rows to hash because we can have very large dataframes
495-
hasher = hashlib.md5()
496-
total_rows = len(df)
497-
# sample rate based on DataFrame size and desired number of samples
498-
sample_rate = max(1, total_rows // desired_samples)
499-
500-
hashes = pd.util.hash_pandas_object(df, index=True)
501-
sampled_hashes = hashes[::sample_rate]
502-
503-
for hash_value in sampled_hashes:
504-
partial_hash = str(hash_value)[:hash_length].encode("utf-8")
505-
hasher.update(partial_hash)
506-
507-
return hasher.hexdigest()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
upgrade:
3+
- |
4+
The `AzureOCRDocumentConverter` no longer produces `Document` objects with the deprecated `dataframe` field.
5+
6+
**Am I affected?**
7+
- If your workflow relies on the `dataframe` field in `Document` objects generated by `AzureOCRDocumentConverter`, you are affected.
8+
- If you saw a `DeprecationWarning` in Haystack 2.10 when initializing a `Document` with a `dataframe`, this change removes that field entirely.
9+
10+
**How to handle the change:**
11+
- Instead of storing detected tables as a `dataframe`, `AzureOCRDocumentConverter` now represents tables as CSV-formatted text in the `content` field of the `Document`.
12+
- Update your processing logic to handle CSV-formatted tables instead of a `dataframe`. If needed, you can convert the CSV text back into a `dataframe` using `pandas.read_csv()`.

test/components/converters/test_azure_ocr_doc_converter.py

Lines changed: 29 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from typing import Literal
99
from unittest.mock import patch
1010

11-
import pandas as pd
1211
import pytest
1312
from azure.ai.formrecognizer import AnalyzeResult
1413

@@ -148,11 +147,15 @@ def result(self) -> AnalyzeResult:
148147
docs = out["documents"]
149148
assert len(docs) == 2
150149
# Checking the table doc extracted
151-
assert docs[0].content_type == "table"
152-
assert docs[0].dataframe.shape[0] == 4 # number of rows
153-
assert docs[0].dataframe.shape[1] == 4 # number of columns
154-
assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
155-
assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
150+
assert (
151+
docs[0].content
152+
== """,Column 1,Column 2,Column 3
153+
A,324,55 million units,2022
154+
B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
155+
C,23.53%,A short string.,
156+
D,$54.35,$6345.,
157+
"""
158+
)
156159
assert (
157160
docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
158161
"standardized and their\nspecification is published only on "
@@ -191,13 +194,21 @@ def result(self) -> AnalyzeResult:
191194
docs = out["documents"]
192195
assert len(docs) == 2
193196
# Checking the table doc extracted that is missing bounding info
194-
assert docs[0].content_type == "table"
195-
assert docs[0].dataframe.shape[0] == 4 # number of rows
196-
assert docs[0].dataframe.shape[1] == 4 # number of columns
197-
assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
198-
assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
199-
# TODO below assert fails
200-
# assert docs[0].meta["preceding_context"] == ""
197+
assert (
198+
docs[0].content
199+
== """,Column 1,Column 2,Column 3
200+
A,324,55 million units,2022
201+
B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
202+
C,23.53%,A short string.,
203+
D,$54.35,$6345.,
204+
"""
205+
)
206+
assert docs[0].meta["preceding_context"] == (
207+
"specification. These proprietary technologies are not standardized and their\nspecification is published "
208+
"only on Adobe's website. Many of them are also not\nsupported by popular third-party implementations of "
209+
"PDF."
210+
)
211+
assert docs[0].meta["following_context"] == ""
201212

202213
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
203214
def test_azure_converter_with_multicolumn_header_table(self, mock_resolve_value, test_files_path) -> None:
@@ -213,20 +224,17 @@ def result(self) -> AnalyzeResult:
213224
azure_mock.return_value = MockPoller()
214225
ocr_node = AzureOCRDocumentConverter(endpoint="")
215226

216-
# TODO: fails because of non-unique column names, azure_sample_pdf_3.json has duplicate column names
217227
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_3.pdf"])
218228

219229
docs = out["documents"]
220230
assert len(docs) == 2
221-
assert docs[0].content_type == "table"
222-
assert docs[0].dataframe.shape[0] == 1 # number of rows
223-
assert docs[0].dataframe.shape[1] == 3 # number of columns
224-
assert list(docs[0].dataframe.columns) == ["This is a subheader", "This is a subheader", "This is a subheader"]
225-
assert list(docs[0].dataframe.iloc[0]) == ["Value 1", "Value 2", "Val 3"]
231+
assert docs[0].content == "This is a subheader,This is a subheader,This is a subheader\nValue 1,Value 2,Val 3\n"
226232
assert (
227233
docs[0].meta["preceding_context"]
228234
== "Table 1. This is an example table with two multicolumn headers\nHeader 1"
229235
)
236+
assert docs[0].meta["following_context"] == ""
237+
assert docs[0].meta["page"] == 1
230238

231239
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
232240
def test_table_pdf_with_non_empty_meta(self, mock_resolve_value, test_files_path) -> None:
@@ -244,7 +252,6 @@ def result(self) -> AnalyzeResult:
244252
out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta=[{"test": "value_1"}])
245253

246254
docs = out["documents"]
247-
# TODO assert below changed from the original test
248255
assert docs[1].meta["test"] == "value_1"
249256

250257
@pytest.mark.integration
@@ -307,27 +314,6 @@ def test_run_with_store_full_path_false(self, test_files_path):
307314
assert "Sample Docx File" in documents[0].content
308315
assert documents[0].meta["file_path"] == "sample_docx.docx"
309316

310-
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
311-
def test_hashing_dataframe(self, mock_resolve_value):
312-
mock_resolve_value.return_value = "test_api_key"
313-
component = AzureOCRDocumentConverter(endpoint="")
314-
hash_length = 32
315-
316-
df = pd.DataFrame({"A": [1, 2, 3]})
317-
hash_string_1 = component._hash_dataframe(df)
318-
assert len(hash_string_1) == hash_length
319-
320-
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
321-
hash_string_2 = component._hash_dataframe(df)
322-
assert len(hash_string_2) == hash_length
323-
324-
df = pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]})
325-
hash_string_3 = component._hash_dataframe(df)
326-
assert len(hash_string_3) == hash_length
327-
328-
# doesn't mean much, more for sanity check
329-
assert hash_string_1 != hash_string_2 != hash_string_3
330-
331317
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
332318
def test_meta_from_byte_stream(self, mock_resolve_value, test_files_path) -> None:
333319
mock_resolve_value.return_value = "test_api_key"
@@ -341,8 +327,8 @@ def result(self) -> AnalyzeResult:
341327
with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
342328
azure_mock.return_value = MockPoller()
343329
ocr_node = AzureOCRDocumentConverter(endpoint="")
344-
bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
345-
byte_stream = ByteStream(data=bytes, meta={"test_from": "byte_stream"})
330+
bytes_ = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
331+
byte_stream = ByteStream(data=bytes_, meta={"test_from": "byte_stream"})
346332
out = ocr_node.run(sources=[byte_stream], meta=[{"test": "value_1"}])
347333

348334
docs = out["documents"]

0 commit comments

Comments
 (0)