Skip to content

Commit 0da039c

Browse files
affan00733 and jarulraj
authored
feat: table filtering based on multiple keywords UDF (#907)
👋 Thanks for submitting a Pull Request to EvaDB! 🙌 We want to make contributing to EvaDB as easy and transparent as possible. Here are a few tips to get you started: - 🔍 Search existing EvaDB [PRs](https://github.com/georgia-tech-db/eva/pulls) to see if a similar PR already exists. - 🔗 Link this PR to an EvaDB [issue](https://github.com/georgia-tech-db/eva/issues) to help us understand what bug fix or feature is being implemented. - 📈 Provide before and after profiling results to help us quantify the improvement your PR provides (if applicable). 👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details. --------- Co-authored-by: jarulraj <arulraj@gatech.edu>
1 parent 84aef2d commit 0da039c

File tree

4 files changed

+125
-2
lines changed

4 files changed

+125
-2
lines changed

evadb/udfs/text_filter_keyword.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# coding=utf-8
2+
# Copyright 2018-2023 EvaDB
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
import numpy as np
16+
import pandas as pd
17+
18+
from evadb.catalog.catalog_type import NdArrayType
19+
from evadb.udfs.abstract.abstract_udf import AbstractUDF
20+
from evadb.udfs.decorators.decorators import forward, setup
21+
from evadb.udfs.decorators.io_descriptors.data_types import PandasDataframe
22+
23+
24+
class TextFilterKeyword(AbstractUDF):
    """Row-wise text filter that nulls out rows mentioning any given keyword.

    For each input row, the first column is the text and the second column
    holds the keyword(s). The text is passed through unchanged when none of
    the keywords occurs in it (case-insensitive); otherwise the output cell
    is ``None`` so that callers can discard it, e.g. with ``dropna``.
    """

    @setup(cacheable=False, udf_type="TextProcessing", batchable=False)
    def setup(self):
        # Nothing to initialise for this UDF.
        pass

    @property
    def name(self) -> str:
        """Return the name of this UDF."""
        return "TextFilterKeyword"

    @forward(
        input_signatures=[
            PandasDataframe(
                columns=["data", "keyword"],
                column_types=[NdArrayType.STR, NdArrayType.STR],
                column_shapes=[(1), (1)],
            )
        ],
        output_signatures=[
            PandasDataframe(
                columns=["filtered"],
                column_types=[NdArrayType.STR],
                column_shapes=[(1)],
            )
        ],
    )
    def forward(self, df: pd.DataFrame) -> pd.DataFrame:
        """Filter out the rows of ``df`` whose text contains a keyword.

        Args:
            df: frame whose first column is the text to inspect and whose
                second column is a keyword or a sequence of keywords.
                Columns are read by position, not by name.

        Returns:
            A frame with a single ``filtered`` column, indexed like ``df``.
            Each cell is the original text when no keyword occurs in it,
            and ``None`` otherwise.
        """
        import re

        def _keep_or_drop(data, keywords):
            # np.atleast_1d lets a bare string count as ONE keyword instead
            # of being iterated character by character; ravel() flattens
            # nested keyword containers.
            for keyword in np.atleast_1d(keywords).ravel():
                # Keywords are literal text, so regex metacharacters such
                # as "+" or "(" must not be interpreted as a pattern. A
                # plain search also finds the keyword anywhere in the text,
                # including after a line break.
                if re.search(re.escape(str(keyword)), data, re.IGNORECASE):
                    return None
            return data

        # A list comprehension (rather than DataFrame.apply) keeps an empty
        # input frame working and always yields exactly one output column.
        filtered = [
            _keep_or_drop(data, keywords)
            for data, keywords in zip(df.iloc[:, 0], df.iloc[:, 1])
        ]
        return pd.DataFrame({"filtered": filtered}, index=df.index)

test/integration_tests/test_load_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343

4444

4545
@pytest.mark.notparallel
46-
class LoadExecutorTest(unittest.TestCase):
46+
class LoadExecutorTests(unittest.TestCase):
4747
def setUp(self):
4848
self.evadb = get_evadb_for_testing()
4949
# reset the catalog manager before running each test

test/integration_tests/test_load_pdf_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323

2424
@pytest.mark.notparallel
25-
class LoadExecutorTest(unittest.TestCase):
25+
class LoadPDFExecutorTests(unittest.TestCase):
2626
def setUp(self):
2727
self.evadb = get_evadb_for_testing()
2828
# reset the catalog manager before running each test
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# coding=utf-8
2+
# Copyright 2018-2023 EvaDB
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
import unittest
16+
from test.util import suffix_pytest_xdist_worker_id_to_dir
17+
18+
import pytest
19+
20+
from evadb.configuration.constants import EvaDB_DATABASE_DIR, EvaDB_ROOT_DIR
21+
from evadb.interfaces.relational.db import connect
22+
from evadb.server.command_handler import execute_query_fetch_all
23+
24+
25+
@pytest.mark.notparallel
class TextFilteringTests(unittest.TestCase):
    """Integration test for the TextFilterKeyword UDF on a loaded PDF table."""

    def setUp(self):
        self.db_dir = suffix_pytest_xdist_worker_id_to_dir(EvaDB_DATABASE_DIR)
        self.conn = connect(self.db_dir)
        self.evadb = self.conn._evadb
        # Reset the catalog before running each test.
        self.evadb.catalog().reset()

    def tearDown(self):
        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS MyPDFs;")

    def test_text_filter(self):
        """Filtering on 'References' must remove at least one PDF row."""
        pdf_path = f"{EvaDB_ROOT_DIR}/data/documents/layout-parser-paper.pdf"
        cursor = self.conn.cursor()
        cursor.load(pdf_path, "MyPDFs", "pdf").df()
        load_pdf_data = cursor.table("MyPDFs").df()
        cursor.create_udf(
            "TextFilterKeyword",
            True,  # presumably if_not_exists; confirm against create_udf
            f"{EvaDB_ROOT_DIR}/evadb/udfs/text_filter_keyword.py",
        ).df()
        filtered_data = (
            cursor.table("MyPDFs")
            .cross_apply("TextFilterKeyword(data, ['References'])", "objs(filtered)")
            .df()
        )
        # The UDF yields a null cell for rows that matched a keyword;
        # dropping those leaves only the rows that passed the filter.
        kept_rows = filtered_data.dropna()
        # Filtering can only remove rows, so "different" means "fewer".
        self.assertLess(len(kept_rows), len(load_pdf_data))

0 commit comments

Comments
(0)