Skip to content

Commit cf1c325

Browse files
committed
add filter to kafka
1 parent 0e92376 commit cf1c325

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

utils/python/hsfs_utils.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
hopsfs = pfs.HadoopFileSystem("default", user=os.environ["HADOOP_USER_NAME"])
1414
from pyspark.sql import SparkSession
1515
from pyspark.sql.types import StructField, StructType, _parse_datatype_string
16-
from pyspark.sql.functions import max
16+
from pyspark.sql.functions import max, expr
1717

1818
import hopsworks
1919

@@ -300,6 +300,10 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
300300
.load()
301301
)
302302

303+
# filter only the necessary entries
304+
df = df.filter(expr("CAST(filter(headers, header -> header.key = 'featureGroupId')[0].value AS STRING)") == str(entity._id))
305+
df = df.filter(expr("CAST(filter(headers, header -> header.key = 'subjectId')[0].value AS STRING)") == str(entity.subject["id"]))
306+
303307
# deserialize dataframe so that it can be properly saved
304308
deserialized_df = engine.get_instance()._deserialize_from_avro(entity, df)
305309

0 commit comments

Comments
 (0)