|
90 | 90 | from hsfs.core.type_systems import PYARROW_HOPSWORKS_DTYPE_MAPPING
|
91 | 91 | from hsfs.core.vector_db_client import VectorDbClient
|
92 | 92 | from hsfs.feature_group import ExternalFeatureGroup, FeatureGroup
|
| 93 | +from hsfs.hopsworks_udf import HopsworksUdf, UDFExecutionMode |
93 | 94 | from hsfs.training_dataset import TrainingDataset
|
94 | 95 | from hsfs.training_dataset_feature import TrainingDatasetFeature
|
95 | 96 | from hsfs.training_dataset_split import TrainingDatasetSplit
|
@@ -1262,6 +1263,7 @@ def _apply_transformation_function(
|
1262 | 1263 | self,
|
1263 | 1264 | transformation_functions: List[transformation_function.TransformationFunction],
|
1264 | 1265 | dataset: Union[pd.DataFrame, pl.DataFrame],
|
| 1266 | + online_inference: bool = False, |
1265 | 1267 | ) -> Union[pd.DataFrame, pl.DataFrame]:
|
1266 | 1268 | """
|
1267 | 1269 | Apply transformation function to the dataframe.
|
@@ -1299,22 +1301,129 @@ def _apply_transformation_function(
|
1299 | 1301 | )
|
1300 | 1302 | if tf.hopsworks_udf.dropped_features:
|
1301 | 1303 | dropped_features.update(tf.hopsworks_udf.dropped_features)
|
1302 |
| - dataset = pd.concat( |
| 1304 | + |
| 1305 | + if ( |
| 1306 | + hopsworks_udf.execution_mode.get_current_execution_mode( |
| 1307 | + online=online_inference |
| 1308 | + ) |
| 1309 | + == UDFExecutionMode.PANDAS |
| 1310 | + ): |
| 1311 | + dataset = self._apply_pandas_udf( |
| 1312 | + hopsworks_udf=hopsworks_udf, dataframe=dataset |
| 1313 | + ) |
| 1314 | + else: |
| 1315 | + dataset = self._apply_python_udf( |
| 1316 | + hopsworks_udf=hopsworks_udf, dataframe=dataset |
| 1317 | + ) |
| 1318 | + dataset = dataset.drop(dropped_features, axis=1) |
| 1319 | + return dataset |
| 1320 | + |
| 1321 | + def _apply_python_udf( |
| 1322 | + self, |
| 1323 | + hopsworks_udf: HopsworksUdf, |
| 1324 | + dataframe: Union[pd.DataFrame, pl.DataFrame], |
| 1325 | + ) -> Union[pd.DataFrame, pl.DataFrame]: |
| 1326 | + """ |
| 1327 | + Apply a python udf to a dataframe |
| 1328 | +
|
| 1329 | + # Arguments |
| 1330 | + transformation_functions `List[transformation_function.TransformationFunction]` : List of transformation functions. |
| 1331 | + dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. |
| 1332 | + # Returns |
| 1333 | + `DataFrame`: A pandas dataframe with the transformed data. |
| 1334 | + # Raises |
| 1335 | + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. |
| 1336 | + """ |
| 1337 | + udf = hopsworks_udf.get_udf(online=False) |
| 1338 | + if isinstance(dataframe, pd.DataFrame): |
| 1339 | + if len(hopsworks_udf.return_types) > 1: |
| 1340 | + dataframe[hopsworks_udf.output_column_names] = dataframe.apply( |
| 1341 | + lambda x: udf(*x[hopsworks_udf.transformation_features]), |
| 1342 | + axis=1, |
| 1343 | + result_type="expand", |
| 1344 | + ) |
| 1345 | + else: |
| 1346 | + dataframe[hopsworks_udf.output_column_names[0]] = dataframe.apply( |
| 1347 | + lambda x: udf(*x[hopsworks_udf.transformation_features]), |
| 1348 | + axis=1, |
| 1349 | + result_type="expand", |
| 1350 | + ) |
| 1351 | + if hopsworks_udf.output_column_names[0] in dataframe.columns: |
| 1352 | + # Overwriting features so reordering dataframe to move overwritten column to the end of the dataframe |
| 1353 | + cols = dataframe.columns.tolist() |
| 1354 | + cols.append( |
| 1355 | + cols.pop(cols.index(hopsworks_udf.output_column_names[0])) |
| 1356 | + ) |
| 1357 | + dataframe = dataframe[cols] |
| 1358 | + else: |
| 1359 | + # Dynamically creating lambda function so that we do not need to loop though to extract features required for the udf. |
| 1360 | + # This is done because polars 'map_rows' provides rows as tuples to the udf. |
| 1361 | + transformation_features = ", ".join( |
1303 | 1362 | [
|
1304 |
| - dataset.reset_index(drop=True), |
1305 |
| - tf.hopsworks_udf.get_udf()( |
1306 |
| - *( |
1307 |
| - [ |
1308 |
| - dataset[feature] |
1309 |
| - for feature in tf.hopsworks_udf.transformation_features |
1310 |
| - ] |
| 1363 | + f"x[{dataframe.columns.index(feature)}]" |
| 1364 | + for feature in hopsworks_udf.transformation_features |
| 1365 | + ] |
| 1366 | + ) |
| 1367 | + feature_mapping_wrapper = eval( |
| 1368 | + f"lambda x: udf({transformation_features})", locals() |
| 1369 | + ) |
| 1370 | + transformed_features = dataframe.map_rows(feature_mapping_wrapper) |
| 1371 | + dataframe = dataframe.with_columns( |
| 1372 | + transformed_features.rename( |
| 1373 | + dict( |
| 1374 | + zip( |
| 1375 | + transformed_features.columns, |
| 1376 | + hopsworks_udf.output_column_names, |
1311 | 1377 | )
|
1312 |
| - ).reset_index(drop=True), |
1313 |
| - ], |
1314 |
| - axis=1, |
| 1378 | + ) |
| 1379 | + ) |
1315 | 1380 | )
|
1316 |
| - dataset = dataset.drop(dropped_features, axis=1) |
1317 |
| - return dataset |
| 1381 | + return dataframe |
| 1382 | + |
| 1383 | + def _apply_pandas_udf( |
| 1384 | + self, |
| 1385 | + hopsworks_udf: HopsworksUdf, |
| 1386 | + dataframe: Union[pd.DataFrame, pl.DataFrame], |
| 1387 | + ) -> Union[pd.DataFrame, pl.DataFrame]: |
| 1388 | + """ |
| 1389 | + Apply a pandas udf to a dataframe |
| 1390 | +
|
| 1391 | + # Arguments |
| 1392 | + transformation_functions `List[transformation_function.TransformationFunction]` : List of transformation functions. |
| 1393 | + dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. |
| 1394 | + # Returns |
| 1395 | + `DataFrame`: A pandas dataframe with the transformed data. |
| 1396 | + # Raises |
| 1397 | + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. |
| 1398 | + """ |
| 1399 | + if len(hopsworks_udf.return_types) > 1: |
| 1400 | + dataframe[hopsworks_udf.output_column_names] = hopsworks_udf.get_udf( |
| 1401 | + online=False |
| 1402 | + )( |
| 1403 | + *( |
| 1404 | + [ |
| 1405 | + dataframe[feature] |
| 1406 | + for feature in hopsworks_udf.transformation_features |
| 1407 | + ] |
| 1408 | + ) |
| 1409 | + ) |
| 1410 | + else: |
| 1411 | + dataframe[hopsworks_udf.output_column_names[0]] = hopsworks_udf.get_udf( |
| 1412 | + online=False |
| 1413 | + )( |
| 1414 | + *( |
| 1415 | + [ |
| 1416 | + dataframe[feature] |
| 1417 | + for feature in hopsworks_udf.transformation_features |
| 1418 | + ] |
| 1419 | + ) |
| 1420 | + ) |
| 1421 | + if hopsworks_udf.output_column_names[0] in dataframe.columns: |
| 1422 | + # Overwriting features so reordering dataframe to move overwritten column to the end of the dataframe |
| 1423 | + cols = dataframe.columns.tolist() |
| 1424 | + cols.append(cols.pop(cols.index(hopsworks_udf.output_column_names[0]))) |
| 1425 | + dataframe = dataframe[cols] |
| 1426 | + return dataframe |
1318 | 1427 |
|
1319 | 1428 | @staticmethod
|
1320 | 1429 | def get_unique_values(
|
|
0 commit comments