hopsfs = pfs.HadoopFileSystem("default", user=os.environ["HADOOP_USER_NAME"])
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, _parse_datatype_string
+from pyspark.sql.functions import max

import hopsworks

@@ -272,20 +273,50 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
        entity.feature_store_id, {}, engine="spark"
    )

+    # get offsets
+    offset_location = entity.location + "_offsets"
+    try:
+        if initial_check_point_string:
+            offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+        else:
+            offset_string = spark.read.json(offset_location).toJSON().first()
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        # if all else fails, read from the beginning
+        initial_check_point_string = kafka_engine.kafka_get_offsets(
+            topic_name=entity._online_topic_name,
+            feature_store_id=entity.feature_store_id,
+            high=False,
+        )
+        offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+
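+    # Illustrative only: offset_string is assumed to end up as a JSON mapping of
+    # topic -> {partition: offset}, e.g. '{"my_fg_topic": {"0": 100, "1": 250}}',
+    # which is the shape Spark's Kafka source accepts for "startingOffsets".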
+    # read kafka topic
    df = (
        spark.read.format("kafka")
        .options(**read_options)
        .option("subscribe", entity._online_topic_name)
-        .option("startingOffsets", _build_starting_offsets(initial_check_point_string)) \
+        .option("startingOffsets", offset_string)
        .load()
+        .limit(5000000)
    )
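+    # Columns exposed by the Kafka source: key, value, topic, partition, offset,
+    # timestamp; value is Avro-deserialized below, partition/offset feed the offset tracking.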

    # deserialize dataframe so that it can be properly saved
    deserialized_df = engine.get_instance()._deserialize_from_avro(entity, df)

+    # insert data
    entity.stream = False  # to make sure we dont write to kafka
    entity.insert(deserialized_df)

+    # update offsets
+    df_offsets = df.groupBy('partition').agg(max('offset').alias('offset')).collect()
+    offset_dict = json.loads(offset_string)
+    for offset_row in df_offsets:
+        offset_dict[f"{entity._online_topic_name}"][f"{offset_row.partition}"] = offset_row.offset
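+    # e.g. after a run that read partition "0" up to offset 1234, offset_dict would look
+    # like {"<topic>": {"0": 1234, ...}} (illustrative values)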
+
+    # save offsets
+    offset_df = spark.createDataFrame([offset_dict])
+    offset_df.write.mode("overwrite").json(offset_location)
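+    # Note: createDataFrame([offset_dict]) yields a single-row DataFrame, so the JSON file
+    # can be read back on the next run with spark.read.json(offset_location).toJSON().first().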
+

def _build_starting_offsets(initial_check_point_string: str):
    if not initial_check_point_string:
        return ""
@@ -299,7 +330,7 @@ def _build_starting_offsets(initial_check_point_string: str):
    # Create the final dictionary structure
    result = {topic: offsets_dict}

-    return json.dumps(result)
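+    # the caller json.dumps() this dict for "startingOffsets" and json.loads() it
+    # again when updating per-partition offsets after the read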
+    return result

if __name__ == "__main__":
    # Setup spark first so it fails faster in case of args errors