feat: table filtering based on multiple keywords UDF (#907)

affan00733 · jarulraj · web-flow · commit 0da039c9a302 · 2023-06-27T15:34:47.000-04:00
👋 Thanks for submitting a Pull Request to EvaDB! 🙌 We want to make contributing to EvaDB as easy and transparent as possible. Here are a few tips to get you started: - 🔍 Search existing EvaDB [PRs](https://github.com/georgia-tech-db/eva/pulls) to see if a similar PR already exists. - 🔗 Link this PR to an EvaDB [issue](https://github.com/georgia-tech-db/eva/issues) to help us understand what bug fix or feature is being implemented. - 📈 Provide before and after profiling results to help us quantify the improvement your PR provides (if applicable). 👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details. --------- Co-authored-by: jarulraj <arulraj@gatech.edu>
diff --git a/evadb/udfs/text_filter_keyword.py b/evadb/udfs/text_filter_keyword.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pandas as pd
+
+from evadb.catalog.catalog_type import NdArrayType
+from evadb.udfs.abstract.abstract_udf import AbstractUDF
+from evadb.udfs.decorators.decorators import forward, setup
+from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe
+
+
+class TextFilterKeyword(AbstractUDF):
+    """UDF that blanks out text rows mentioning any of the given keywords."""
+
+    @setup(cacheable=False, udf_type="TextProcessing", batchable=False)
+    def setup(self):
+        """Nothing to initialise: filtering is plain string matching."""
+
+    @property
+    def name(self) -> str:
+        return "TextFilterKeyword"
+
+    @forward(
+        input_signatures=[
+            PandasDataframe(
+                columns=["data", "keyword"],
+                column_types=[NdArrayType.STR, NdArrayType.STR],
+                column_shapes=[(1), (1)],
+            )
+        ],
+        output_signatures=[
+            PandasDataframe(
+                columns=["filtered"],
+                column_types=[NdArrayType.STR],
+                column_shapes=[(1)],
+            )
+        ],
+    )
+    def forward(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return each row's text, or None when the text contains a keyword.
+
+        Keywords are literal and case-insensitive; a lone str is one keyword.
+        """
+        import re
+
+        def _forward(row: pd.Series) -> np.ndarray:
+            data, keywords = row.iloc[0], row.iloc[1]
+            keywords = [keywords] if isinstance(keywords, str) else keywords
+            if any(re.search(re.escape(k), data, re.IGNORECASE) for k in keywords):
+                return None
+            return data
+
+        ret = pd.DataFrame()
+        ret["filtered"] = df.apply(_forward, axis=1)
+        return ret
diff --git a/test/integration_tests/test_load_executor.py b/test/integration_tests/test_load_executor.py
@@ -43,7 +43,7 @@
 
 
 @pytest.mark.notparallel
-class LoadExecutorTest(unittest.TestCase):
+class LoadExecutorTests(unittest.TestCase):
     def setUp(self):
         self.evadb = get_evadb_for_testing()
         # reset the catalog manager before running each test
diff --git a/test/integration_tests/test_load_pdf_executor.py b/test/integration_tests/test_load_pdf_executor.py
@@ -22,7 +22,7 @@
 
 
 @pytest.mark.notparallel
-class LoadExecutorTest(unittest.TestCase):
+class LoadPDFExecutorTests(unittest.TestCase):
     def setUp(self):
         self.evadb = get_evadb_for_testing()
         # reset the catalog manager before running each test
diff --git a/test/integration_tests/test_text_filtering.py b/test/integration_tests/test_text_filtering.py
@@ -0,0 +1,56 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from test.util import suffix_pytest_xdist_worker_id_to_dir
+
+import pytest
+
+from evadb.configuration.constants import EvaDB_DATABASE_DIR, EvaDB_ROOT_DIR
+from evadb.interfaces.relational.db import connect
+from evadb.server.command_handler import execute_query_fetch_all
+
+
+@pytest.mark.notparallel
+class TextFilteringTests(unittest.TestCase):
+    """Integration test for the TextFilterKeyword UDF on a loaded PDF table."""
+
+    def setUp(self):
+        self.db_dir = suffix_pytest_xdist_worker_id_to_dir(EvaDB_DATABASE_DIR)
+        self.conn = connect(self.db_dir)
+        self.evadb = self.conn._evadb
+        self.evadb.catalog().reset()
+
+    def tearDown(self):
+        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyPDFs;")
+
+    def test_text_filter(self):
+        """Filtering on 'References' must change the number of surviving rows."""
+        pdf_path = f"{EvaDB_ROOT_DIR}/data/documents/layout-parser-paper.pdf"
+        cursor = self.conn.cursor()
+        cursor.load(pdf_path, "MyPDFs", "pdf").df()
+        load_pdf_data = cursor.table("MyPDFs").df()
+        cursor.create_udf(
+            "TextFilterKeyword",
+            True,
+            f"{EvaDB_ROOT_DIR}/evadb/udfs/text_filter_keyword.py",
+        ).df()
+        filtered_data = (
+            cursor.table("MyPDFs")
+            .cross_apply("TextFilterKeyword(data, ['References'])", "objs(filtered)")
+            .df()
+        )
+        # The UDF yields None for rejected rows; drop them before counting.
+        filtered_data = filtered_data.dropna()
+        self.assertNotEqual(len(filtered_data), len(load_pdf_data))