Skip to content

Commit ea8aef6

Browse files
committed
test_serialize_deserialize_avro
1 parent d910f92 commit ea8aef6

File tree

1 file changed

+32 -20 lines changed

python/tests/engine/test_spark.py

+32 -20
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#
1616
from __future__ import annotations
1717

18+
import datetime
1819
import hopsworks_common
1920
import numpy
2021
import pandas as pd
@@ -1707,21 +1708,24 @@ def test_save_online_dataframe(self, mocker, backend_fixtures):
17071708
== 1
17081709
)
17091710

1710-
def test_serialize_to_avro(self, mocker):
1711+
def test_serialize_deserialize_avro(self, mocker):
17111712
# Arrange
1712-
mocker.patch("hopsworks_common.client.get_instance")
1713-
mocker.patch(
1714-
"hsfs.feature_group.FeatureGroup.get_complex_features",
1715-
return_value=["col_1"],
1716-
)
1717-
mocker.patch("hsfs.feature_group.FeatureGroup._get_feature_avro_schema")
1718-
17191713
spark_engine = spark.Engine()
17201714

1721-
d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]}
1722-
df = pd.DataFrame(data=d)
1715+
now = datetime.datetime.now()
17231716

1724-
spark_df = spark_engine._spark_session.createDataFrame(df)
1717+
fg_data = []
1718+
fg_data.append(("ekarson", ["GRAVITY RUSH 2", "KING'S QUEST"], pd.Timestamp(now.timestamp())))
1719+
fg_data.append(("ratmilkdrinker", ["NBA 2K", "CALL OF DUTY"], pd.Timestamp(now.timestamp())))
1720+
pandas_df = pd.DataFrame(fg_data, columns =["account_id", "last_played_games", "event_time"])
1721+
1722+
df = spark_engine._spark_session.createDataFrame(pandas_df)
1723+
1724+
features = [
1725+
feature.Feature(name="account_id", type="str"),
1726+
feature.Feature(name="last_played_games", type="xx"),
1727+
feature.Feature(name="event_time", type="timestamp"),
1728+
]
17251729

17261730
fg = feature_group.FeatureGroup(
17271731
name="test",
@@ -1730,22 +1734,30 @@ def test_serialize_to_avro(self, mocker):
17301734
primary_key=[],
17311735
partition_key=[],
17321736
id=10,
1737+
features=features,
17331738
)
1734-
fg._subject = {"schema": '{"fields": [{"name": "col_0"}]}'}
1735-
1736-
expected = pd.DataFrame(data={"col_0": ["test_1", "test_2"]})
1739+
fg._subject = {
1740+
'id': 1025,
1741+
'subject': 'fg_1',
1742+
'version': 1,
1743+
'schema': '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]},{"name":"event_time","type":["null",{"type":"long","logicalType":"timestamp-micros"}]}]}'
1744+
}
17371745

17381746
# Act
1739-
result = spark_engine._serialize_to_avro(
1747+
serialized_df = spark_engine._serialize_to_avro(
17401748
feature_group=fg,
1741-
dataframe=spark_df,
1749+
dataframe=df,
1750+
)
1751+
1752+
deserialized_df = spark_engine._deserialize_from_avro(
1753+
feature_group=fg,
1754+
dataframe=serialized_df,
17421755
)
17431756

17441757
# Assert
1745-
result_df = result.toPandas()
1746-
assert list(result_df) == list(expected)
1747-
for column in list(result_df):
1748-
assert result_df[column].equals(expected[column])
1758+
assert serialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}'
1759+
assert df.schema == deserialized_df.schema
1760+
assert df.collect() == deserialized_df.collect()
17491761

17501762
def test_get_training_data(self, mocker):
17511763
# Arrange

0 commit comments

Comments
 (0)