hopsfs = pfs.HadoopFileSystem("default", user=os.environ["HADOOP_USER_NAME"])
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, _parse_datatype_string
+from pyspark.sql.functions import max

import hopsworks

@@ -272,20 +273,50 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
        entity.feature_store_id, {}, engine="spark"
    )

+    # get offsets
+    offset_location = entity.location + "_offsets"
+    try:
+        if initial_check_point_string:
+            offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+        else:
+            offset_string = spark.read.json(offset_location).toJSON().first()
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        # if all else fails, read from the beginning
+        initial_check_point_string = kafka_engine.kafka_get_offsets(
+            topic_name=entity._online_topic_name,
+            feature_store_id=entity.feature_store_id,
+            high=False,
+        )
+        offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+
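+    # Illustrative only: offset_string is assumed to end up as a JSON mapping of
+    # topic -> {partition: offset}, e.g. '{"my_fg_topic": {"0": 100, "1": 250}}',
+    # which is the shape Spark's Kafka source accepts for "startingOffsets".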
+    # read kafka topic
    df = (
        spark.read.format("kafka")
        .options(**read_options)
        .option("subscribe", entity._online_topic_name)
-        .option("startingOffsets", _build_starting_offsets(initial_check_point_string)) \
+        .option("startingOffsets", offset_string)
        .load()
+        .limit(5000000)
    )
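+    # Columns exposed by the Kafka source: key, value, topic, partition, offset,
+    # timestamp; value is Avro-deserialized below, partition/offset feed the offset tracking.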

    # deserialize dataframe so that it can be properly saved
    deserialized_df = engine.get_instance()._deserialize_from_avro(entity, df)

+    # insert data
    entity.stream = False  # to make sure we dont write to kafka
    entity.insert(deserialized_df)

+    # update offsets
+    df_offsets = df.groupBy('partition').agg(max('offset').alias('offset')).collect()
+    offset_dict = json.loads(offset_string)
+    for offset_row in df_offsets:
+        offset_dict[f"{entity._online_topic_name}"][f"{offset_row.partition}"] = offset_row.offset
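+    # e.g. after a run that read partition "0" up to offset 1234, offset_dict would look
+    # like {"<topic>": {"0": 1234, ...}} (illustrative values)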
+
+    # save offsets
+    offset_df = spark.createDataFrame([offset_dict])
+    offset_df.write.mode("overwrite").json(offset_location)
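+    # Note: createDataFrame([offset_dict]) yields a single-row DataFrame, so the JSON file
+    # can be read back on the next run with spark.read.json(offset_location).toJSON().first().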
+

def _build_starting_offsets(initial_check_point_string: str):
    if not initial_check_point_string:
        return ""
@@ -299,7 +330,7 @@ def _build_starting_offsets(initial_check_point_string: str):
    # Create the final dictionary structure
    result = {topic: offsets_dict}

-    return json.dumps(result)
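+    # the caller json.dumps() this dict for "startingOffsets" and json.loads() it
+    # again when updating per-partition offsets after the read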
+    return result

if __name__ == "__main__":
    # Setup spark first so it fails faster in case of args errors