Commit ad441ee

bubriks and SirOibaf authored
[FSTORE-1632] Change limit location for delta ingestion (#422)
Co-authored-by: Fabio Buso <dev.siroibaf@gmail.com>
1 parent ccabb28 · commit ad441ee

File tree

1 file changed (+8 −5 lines)


utils/python/hsfs_utils.py (+8 −5)
@@ -301,22 +301,25 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
         .option("includeHeaders", "true")
         .option("failOnDataLoss", "false")
         .load()
-        .limit(5000000)
     )
 
     # filter only the necessary entries
-    df = df.filter(expr("CAST(filter(headers, header -> header.key = 'featureGroupId')[0].value AS STRING)") == str(entity._id))
-    df = df.filter(expr("CAST(filter(headers, header -> header.key = 'subjectId')[0].value AS STRING)") == str(entity.subject["id"]))
+    filtered_df = df.filter(expr("CAST(filter(headers, header -> header.key = 'featureGroupId')[0].value AS STRING)") == str(entity._id))
+    filtered_df = filtered_df.filter(expr("CAST(filter(headers, header -> header.key = 'subjectId')[0].value AS STRING)") == str(entity.subject["id"]))
+
+    # limit the number of records ingested
+    limit = job_conf.get("write_options", {}).get("job_limit", 5000000)
+    filtered_df = filtered_df.limit(limit)
 
     # deserialize dataframe so that it can be properly saved
-    deserialized_df = engine.get_instance()._deserialize_from_avro(entity, df)
+    deserialized_df = engine.get_instance()._deserialize_from_avro(entity, filtered_df)
 
     # insert data
     entity.stream = False  # to make sure we don't write to kafka
     entity.insert(deserialized_df, storage="offline")
 
     # update offsets
-    df_offsets = df.groupBy('partition').agg(max('offset').alias('offset')).collect()
+    df_offsets = (df if limit > filtered_df.count() else filtered_df).groupBy('partition').agg(max('offset').alias('offset')).collect()
     offset_dict = json.loads(offset_string)
     for offset_row in df_offsets:
         offset_dict[f"{entity._online_topic_name}"][f"{offset_row.partition}"] = offset_row.offset + 1
