@@ -734,20 +734,28 @@ def parse_schema_feature_group(
         self,
         dataframe: Union[pd.DataFrame, pl.DataFrame],
         time_travel_format: Optional[str] = None,
+        features: Optional[List[feature.Feature]] = None,
     ) -> List[feature.Feature]:
+        feature_type_map = {}
+        if features:
+            for _feature in features:
+                feature_type_map[_feature.name] = _feature.type
         if isinstance(dataframe, pd.DataFrame):
             arrow_schema = pa.Schema.from_pandas(dataframe, preserve_index=False)
         elif isinstance(dataframe, pl.DataFrame) or isinstance(
             dataframe, pl.dataframe.frame.DataFrame
         ):
             arrow_schema = dataframe.to_arrow().schema
         features = []
-        for feat_name in arrow_schema.names:
+        for i in range(len(arrow_schema.names)):
+            feat_name = arrow_schema.names[i]
             name = util.autofix_feature_name(feat_name)
             try:
-                converted_type = convert_pandas_dtype_to_offline_type(
-                    arrow_schema.field(feat_name).type
-                )
+                pd_type = arrow_schema.field(feat_name).type
+                if pa.types.is_null(pd_type) and feature_type_map.get(name):
+                    converted_type = feature_type_map.get(name)
+                else:
+                    converted_type = convert_pandas_dtype_to_offline_type(pd_type)
             except ValueError as e:
                 raise FeatureStoreException(f"Feature '{name}': {str(e)}") from e
             features.append(feature.Feature(name, converted_type))
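
A minimal sketch of the null-type fallback introduced above, assuming only pandas and pyarrow: Arrow assigns the null type to an all-None column, so the parser falls back to the type declared on the matching feature. The feature_type_map here is a hand-built stand-in for the one derived from the new features argument.

import pandas as pd
import pyarrow as pa

# Arrow cannot infer a type for an all-None column; it becomes pa.null().
df = pd.DataFrame({"id": [1, 2], "score": [None, None]})
arrow_schema = pa.Schema.from_pandas(df, preserve_index=False)

# Hand-built stand-in for the map derived from the `features` argument.
feature_type_map = {"score": "double"}

for feat_name in arrow_schema.names:
    pa_type = arrow_schema.field(feat_name).type
    if pa.types.is_null(pa_type) and feature_type_map.get(feat_name):
        # Fall back to the explicitly declared feature type.
        print(feat_name, "->", feature_type_map[feat_name])  # score -> double
    else:
        print(feat_name, "->", pa_type)  # id -> int64
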
@@ -1422,7 +1430,7 @@ def _start_offline_materialization(offline_write_options: Dict[str, Any]) -> bool:
         return True
 
     @staticmethod
-    def _convert_feature_log_to_df(feature_log, cols):
+    def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame:
         if feature_log is None and cols:
             return pd.DataFrame(columns=cols)
         if not (
@@ -1442,40 +1450,40 @@ def _convert_feature_log_to_df(feature_log, cols):
 
             return pd.DataFrame(feature_log, columns=cols)
         else:
-            return feature_log.copy(deep=False)
+            if isinstance(feature_log, pl.DataFrame):
+                return feature_log.clone().to_pandas()
+            elif isinstance(feature_log, pd.DataFrame):
+                return feature_log.copy(deep=False)
 
     @staticmethod
     def get_feature_logging_df(
-        fg,
-        features,
-        fg_features: List[TrainingDatasetFeature],
-        td_predictions: List[TrainingDatasetFeature],
-        td_col_name,
-        time_col_name,
-        model_col_name,
-        prediction=None,
-        training_dataset_version=None,
+        features: Union[pd.DataFrame, list[list], np.ndarray],
+        fg: FeatureGroup = None,
+        td_features: List[str] = None,
+        td_predictions: List[TrainingDatasetFeature] = None,
+        td_col_name: Optional[str] = None,
+        time_col_name: Optional[str] = None,
+        model_col_name: Optional[str] = None,
+        predictions: Optional[Union[pd.DataFrame, list[list], np.ndarray]] = None,
+        training_dataset_version: Optional[int] = None,
         hsml_model=None,
     ) -> pd.DataFrame:
-        import uuid
-
         features = Engine._convert_feature_log_to_df(
-            features, [f.name for f in fg_features]
+            features, td_features
         )
         if td_predictions:
-            prediction = Engine._convert_feature_log_to_df(
-                prediction, [f.name for f in td_predictions]
+            predictions = Engine._convert_feature_log_to_df(
+                predictions, [f.name for f in td_predictions]
             )
             for f in td_predictions:
-                prediction[f.name] = Engine._cast_column_to_offline_type(
-                    prediction[f.name], f.type
+                predictions[f.name] = cast_column_to_offline_type(
+                    predictions[f.name], f.type
                 )
-            if not set(prediction.columns).intersection(set(features.columns)):
-                features = pd.concat([features, prediction], axis=1)
-        # need to case the column type as if it is None, type cannot be inferred.
-        features[td_col_name] = Engine._cast_column_to_offline_type(
-            pd.Series([training_dataset_version for _ in range(len(features))]),
-            fg.get_feature(td_col_name).type,
+            if not set(predictions.columns).intersection(set(features.columns)):
+                features = pd.concat([features, predictions], axis=1)
+
+        features[td_col_name] = pd.Series(
+            [training_dataset_version for _ in range(len(features))]
         )
         # _cast_column_to_offline_type cannot cast string type
         features[model_col_name] = pd.Series(
@@ -1488,9 +1496,12 @@ def get_feature_logging_df(
             dtype=pd.StringDtype(),
         )
         now = datetime.now()
-        features[time_col_name] = Engine._cast_column_to_offline_type(
-            pd.Series([now for _ in range(len(features))]),
-            fg.get_feature(time_col_name).type,
-        )
+
+        features[time_col_name] = pd.Series([now for _ in range(len(features))])
         features["log_id"] = [str(uuid.uuid4()) for _ in range(len(features))]
         return features[[feat.name for feat in fg.features]]
+
+    @staticmethod
+    def read_feature_log(query):
+        df = query.read()
+        return df.drop(["log_id", FeatureViewEngine._LOG_TIME], axis=1)
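
A quick sketch of the normalization now done in _convert_feature_log_to_df, assuming pandas and polars are installed: a polars frame is cloned and converted to pandas, while a pandas frame is only shallow-copied, so both branches satisfy the new -> pd.DataFrame annotation.

import pandas as pd
import polars as pl

logs_pl = pl.DataFrame({"feature": [0.1, 0.2], "prediction": [0, 1]})
logs_pd = pd.DataFrame({"feature": [0.1, 0.2], "prediction": [0, 1]})

# Polars branch: clone first so the caller's frame stays untouched.
out_pl = logs_pl.clone().to_pandas()
# Pandas branch: shallow copy; column data is shared with the input.
out_pd = logs_pd.copy(deep=False)

assert isinstance(out_pl, pd.DataFrame)
assert isinstance(out_pd, pd.DataFrame)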