1
+ import hopsworks
2
+ import xgboost as xgb
3
+ import pandas as pd
4
+ import os
5
+ from sklearn .metrics import confusion_matrix
6
+ from sklearn .metrics import f1_score
7
+ from matplotlib import pyplot
8
+ import seaborn as sns
9
+ import joblib
10
+ from hsml .schema import Schema
11
+ from hsml .model_schema import ModelSchema
12
+ from mage_ai .data_preparation .shared .secrets import get_secret_value
13
+ if 'data_exporter' not in globals ():
14
+ from mage_ai .data_preparation .decorators import data_exporter
15
+
16
+
17
def prepare_training_data(X_train, X_test, y_train, y_test, sort_column="datetime"):
    """Chronologically order the train/test splits and drop the timestamp column.

    Sorting by the timestamp keeps feature rows in temporal order, and the
    targets are re-aligned to the sorted feature index so every label still
    matches its row. The timestamp itself is not a model feature, so it is
    removed before training.

    Args:
        X_train: Training features containing a ``sort_column`` column.
        X_test: Test features containing a ``sort_column`` column.
        y_train: Training targets indexed like ``X_train``.
        y_test: Test targets indexed like ``X_test``.
        sort_column: Name of the timestamp column to sort on and then drop.
            Defaults to ``"datetime"`` for backward compatibility.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` with the features sorted
        chronologically and the timestamp column removed.
    """
    # sort_values returns a new frame, so the caller's objects are not mutated
    X_train = X_train.sort_values(sort_column)
    X_test = X_test.sort_values(sort_column)

    # Re-align the targets with the new row order of their feature frames
    y_train = y_train.reindex(X_train.index)
    y_test = y_test.reindex(X_test.index)

    # The timestamp is bookkeeping, not a feature — drop it before training
    X_train = X_train.drop(columns=[sort_column])
    X_test = X_test.drop(columns=[sort_column])

    return X_train, X_test, y_train, y_test
37
+
38
+
39
@data_exporter
def train_model(data, *args, **kwargs):
    """
    Train an XGBoost classifier for fraud detection and save it in the
    Hopsworks Model Registry.

    The block logs in to Hopsworks, materializes a train/test split from the
    'transactions_view' feature view, trains the classifier, evaluates it
    (macro F1 score plus a confusion-matrix heatmap), and uploads the model,
    its schema, metrics, and the heatmap image to the model registry.

    Args:
        data: The output from the upstream parent block
        args: The output from any additional upstream blocks (if applicable)
    """
    TEST_SIZE = 0.2

    # Authenticate against Hopsworks with the API key stored as a Mage secret
    project = hopsworks.login(
        api_key_value=get_secret_value('HOPSWORKS_API_KEY'),
    )

    fs = project.get_feature_store()

    # Get the 'transactions_view' feature view
    feature_view = fs.get_feature_view(
        name='transactions_view',
        version=1,
    )

    # Materialize a train/test split from the feature view
    X_train, X_test, y_train, y_test = feature_view.train_test_split(
        description='transactions fraud training dataset',
        test_size=TEST_SIZE,
    )

    # Sort chronologically, re-align targets, and drop the timestamp column
    X_train, X_test, y_train, y_test = prepare_training_data(
        X_train,
        X_test,
        y_train,
        y_test,
    )

    # Persist the training features for inspection/debugging
    # (was an f-string with no placeholders — plain literal is equivalent)
    X_train.to_csv('X_train.csv')

    # Create an XGBoost classifier
    model = xgb.XGBClassifier()

    # Fit the classifier to the training data
    model.fit(X_train, y_train)

    # Predict the test data using the trained classifier
    y_pred_test = model.predict(X_test)

    # Compute the macro F1 score — the metric registered with the model
    metrics = {
        "f1_score": f1_score(y_test, y_pred_test, average='macro'),
    }

    # Calculate and print the confusion matrix for the test predictions
    results = confusion_matrix(y_test, y_pred_test)
    print(results)

    # Label the confusion matrix explicitly (keyword args instead of relying
    # on the positional index/columns order of the DataFrame constructor)
    df_cm = pd.DataFrame(
        results,
        index=['True Normal', 'True Fraud'],
        columns=['Pred Normal', 'Pred Fraud'],
    )

    # Render the confusion matrix as an annotated heatmap and keep its figure
    # so it can be saved alongside the model artifacts
    fig = sns.heatmap(df_cm, annot=True).get_figure()

    # Describe the model's input (feature values) and output (target) schemas
    input_schema = Schema(X_train.values)
    output_schema = Schema(y_train)
    model_schema = ModelSchema(
        input_schema=input_schema,
        output_schema=output_schema,
    )

    # Directory for the model and related artifacts; exist_ok makes the
    # block idempotent across re-runs
    model_dir = "quickstart_fraud_model"
    os.makedirs(model_dir, exist_ok=True)

    # Save the trained classifier and the confusion-matrix image
    joblib.dump(model, os.path.join(model_dir, 'xgboost_model.pkl'))
    fig.savefig(os.path.join(model_dir, "confusion_matrix.png"))

    # Register the model in the Hopsworks Model Registry
    mr = project.get_model_registry()
    fraud_model = mr.python.create_model(
        name="fraud",
        metrics=metrics,  # Evaluation metrics shown in the registry UI
        model_schema=model_schema,  # Input/output schema defined above
        input_example=[4700702588013561],  # Example input for deployment tests
        description="Quickstart Fraud Predictor",
    )

    # Upload the artifact directory to the registry
    fraud_model.save(model_dir)
0 commit comments