@@ -259,7 +259,7 @@ def delta_vacuum_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None:
     entity.delta_vacuum()
 
 
-def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any]) -> None:
+def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], initial_check_point_string: str) -> None:
     """
     Run materialization job on a feature group.
     """
@@ -276,6 +276,7 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any]) -> None:
         spark.read.format("kafka")
         .options(**read_options)
         .option("subscribe", entity._online_topic_name)
+        .option("startingOffsets", _build_starting_offsets(initial_check_point_string)) \
         .load()
     )
 
@@ -285,6 +286,20 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any]) -> None:
     entity.stream = False  # to make sure we don't write to kafka
     entity.insert(deserialized_df)
 
+def _build_starting_offsets(initial_check_point_string: str):
+    if not initial_check_point_string:
+        return ""
+
+    # Split the input string into the topic and partition-offset pairs
+    topic, offsets = initial_check_point_string.split(',', 1)
+
+    # Split the offsets and build a dictionary from them
+    offsets_dict = {partition: int(offset) for partition, offset in (pair.split(':') for pair in offsets.split(','))}
+
+    # Create the final dictionary structure
+    result = {topic: offsets_dict}
+
+    return json.dumps(result)
 
 if __name__ == "__main__":
     # Setup spark first so it fails faster in case of args errors
@@ -325,6 +340,12 @@ def parse_isoformat_date(da: str) -> datetime:
         help="Job start time",
     )
 
+    parser.add_argument(
+        "-initialCheckPointString",
+        type=str,
+        help="Kafka offset to start consuming from",
+    )
+
     args = parser.parse_args()
     job_conf = read_job_conf(args.path)
 
@@ -347,7 +368,7 @@ def parse_isoformat_date(da: str) -> datetime:
         elif args.op == "delta_vacuum_fg":
            delta_vacuum_fg(spark, job_conf)
         elif args.op == "offline_fg_materialization":
-            offline_fg_materialization(spark, job_conf)
+            offline_fg_materialization(spark, job_conf, args.initialCheckPointString)
 
         success = True
     except Exception:
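For reference, here is a sketch of what the new _build_starting_offsets helper emits, assuming the checkpoint string follows the "<topic>,<partition>:<offset>[,<partition>:<offset>...]" layout implied by its parsing logic. The topic name and offsets below are made up for illustration.

import json

# Copy of the helper added in this diff, exercised with a sample input.
def _build_starting_offsets(initial_check_point_string: str):
    if not initial_check_point_string:
        return ""
    # "<topic>,<p>:<o>,..." -> {"<topic>": {"<p>": <o>, ...}}
    topic, offsets = initial_check_point_string.split(',', 1)
    offsets_dict = {partition: int(offset) for partition, offset in (pair.split(':') for pair in offsets.split(','))}
    return json.dumps({topic: offsets_dict})

# Hypothetical topic and partition offsets.
print(_build_starting_offsets("fg_topic_1,0:42,1:17"))
# prints: {"fg_topic_1": {"0": 42, "1": 17}}

This is the per-partition JSON shape that Spark's Kafka source accepts for the startingOffsets option, so the batch read above starts from the recorded offsets instead of the source default.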
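On the CLI side, a minimal sketch of how the new flag behaves, assuming the parser is otherwise configured as in the diff (the flag spelling is taken verbatim from it; the checkpoint value is hypothetical).

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-initialCheckPointString",
    type=str,
    help="Kafka offset to start consuming from",
)

# Explicit invocation with a made-up checkpoint string.
args = parser.parse_args(["-initialCheckPointString", "fg_topic_1,0:42,1:17"])
print(args.initialCheckPointString)  # fg_topic_1,0:42,1:17

# When the flag is omitted, argparse yields None, which the falsy check
# in _build_starting_offsets maps to an empty string.
args = parser.parse_args([])
print(args.initialCheckPointString)  # None

Because the flag is optional, existing offline_fg_materialization job invocations that do not pass a checkpoint keep parsing cleanly.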