@@ -8,36 +8,49 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg
8
8
9
9
=== "Python"
10
10
```python
11
+ fs = ...
12
+ credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
13
+ account_details_fg = fs.get_feature_group(name="account_details", version=1)
14
+ merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
15
+
11
16
# create a query
12
- feature_join = rain_fg .select_all() \
13
- .join(temperature_fg .select_all(), on=[ "date", "location_id "] ) \
14
- .join(location_fg .select_all())
17
+ selected_features = credit_card_transactions_fg .select_all() \
18
+ .join(account_details_fg .select_all(), on=["cc_num "]) \
19
+ .join(merchant_details_fg .select_all())
15
20
16
21
# save the query to feature view
17
22
feature_view = fs.create_feature_view(
18
- name='rain_dataset',
19
- query=feature_join
23
+ version=1,
24
+ name='credit_card_fraud',
25
+ labels=["is_fraud"],
26
+ query=selected_features
20
27
)
21
28
22
29
# retrieve the query back from the feature view
23
- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
30
+ feature_view = fs.get_feature_view(“credit_card_fraud ”, version=1)
24
31
query = feature_view.query
25
32
```
26
33
27
34
=== "Scala"
28
35
```scala
36
+
37
+ val fs = ...
38
+ val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1)
39
+ val accountDetailsFg = fs.getFeatureGroup("account_details", 1)
40
+ val merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
41
+
29
42
// create a query
30
- val featureJoin = (rainFg .selectAll()
31
- .join(temperatureFg .selectAll(), on=Seq("date", "location_id "))
32
- .join(locationFg .selectAll()))
43
+ val selectedFeatures = (creditCardTransactionsFg .selectAll()
44
+ .join(accountDetailsFg .selectAll(), on=Seq("cc_num "))
45
+ .join(merchantDetailsFg .selectAll()))
33
46
34
47
val featureView = fs.createFeatureView()
35
- .name("rain_dataset ")
36
- .query(featureJoin )
48
+ .name("credit_card_fraud ")
49
+ .query(selectedFeatures )
37
50
.build();
38
51
39
52
// retrieve the query back from the feature view
40
- val featureView = fs.getFeatureView(“rain_dataset ”, 1)
53
+ val featureView = fs.getFeatureView(“credit_card_fraud ”, 1)
41
54
val query = featureView.getQuery()
42
55
```
43
56
@@ -53,18 +66,18 @@ Selecting features from a feature group is a lazy operation, returning a query w
53
66
54
67
=== "Python"
55
68
```python
56
- rain_fg = fs.get_feature_group("rain_fg ")
69
+ credit_card_transactions_fg = fs.get_feature_group("credit_card_transactions ")
57
70
58
71
# Returns Query
59
- feature_join = rain_fg .select(["location_id ", "weekly_rainfall "])
72
+ selected_features = credit_card_transactions_fg .select(["amount ", "latitude", "longitude "])
60
73
```
61
74
62
75
=== "Scala"
63
76
```Scala
64
- val rainFg = fs.getFeatureGroup("rain_fg ")
77
+ val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions ")
65
78
66
79
// Returns Query
67
- val featureJoin = rainFg .select(Seq("location_id ", "weekly_rainfall "))
80
+ val selectedFeatures = creditCardTransactionsFg .select(Seq("amount ", "latitude", "longitude "))
68
81
```
69
82
70
83
#### Join
@@ -75,35 +88,103 @@ By default, Hopsworks will use the maximal matching subset of the primary keys o
75
88
=== "Python"
76
89
```python
77
90
# Returns Query
78
- feature_join = rain_fg .join(temperature_fg )
91
+ selected_features = credit_card_transactions_fg .join(account_details_fg )
79
92
```
80
93
81
94
=== "Scala"
82
95
```Scala
83
96
// Returns Query
84
- val featureJoin = rainFg .join(temperatureFg )
97
+ val selectedFeatures = creditCardTransactionsFg .join(accountDetailsFg )
85
98
```
86
99
More complex joins are possible by selecting subsets of features from the joined feature groups and by specifying a join key and type.
87
- Possible join types are "inner", "left" or "right". Furthermore, it is possible to specify different features for the join key of the left and right feature group.
88
- The join key lists should contain the names of the features to join on.
100
+ Possible join types are "inner", "left" or "right". By default `join_type` is `"left"`. Furthermore, it is possible to specify different
101
+ features for the join key of the left and right feature group. The join key lists should contain the names of the features to join on.
89
102
90
103
=== "Python"
91
104
```python
92
- feature_join = rain_fg .select_all() \
93
- .join(temperature_fg .select_all(), on=[ "date", "location_id "] ) \
94
- .join(location_fg .select_all(), left_on=[ "location_id "] , right_on=[ "id"] , join_type="left ")
105
+ selected_features = credit_card_transactions_fg .select_all() \
106
+ .join(account_details_fg .select_all(), on=[ "cc_num "] ) \
107
+ .join(merchant_details_fg .select_all(), left_on=[ "merchant_id "] , right_on=[ "id"] , join_type="inner ")
95
108
```
96
109
97
110
=== "Scala"
98
111
```scala
99
- val featureJoin = (rainFg.selectAll()
100
- .join(temperatureFg.selectAll(), Seq("date", "location_id"))
101
- .join(locationFg.selectAll(), Seq("location_id"), Seq("id"), "left"))
112
+ val selectedFeatures = (creditCardTransactionsFg.selectAll()
113
+ .join(accountDetailsFg.selectAll(), Seq("cc_num"))
114
+ .join(merchantDetailsFg.selectAll(), Seq("merchant_id"), Seq("id"), "inner"))
115
+ ```
116
+
117
+ ### Data modeling in Hopsworks
118
+
119
+ Since v4.0, the Hopsworks feature selection API supports both Star and Snowflake schema data models.
120
+
121
+ #### Star schema data model
122
+
123
+ When choosing the Star schema data model, all tables are children of the parent (the leftmost) feature group, which has all
124
+ foreign keys for its child feature groups.
125
+
126
+ <p align =" center " >
127
+ <figure >
128
+ <img src="../../../../assets/images/guides/fs/feature_view/star.png" alt="Star schema data model">
129
+ <figcaption>Star schema data model</figcaption>
130
+ </figure >
131
+ </p >
132
+
133
+ === "Python"
134
+ ```python
135
+ selected_features = credit_card_transactions.select_all() \
136
+ .join(aggregated_cc_transactions.select_all()) \
137
+ .join(account_details.select_all()) \
138
+ .join(merchant_details.select_all()) \
139
+ .join(cc_issuer_details.select_all())
102
140
```
103
141
104
- !!! error "Nested Joins"
105
- The API currently does not support nested joins. That is joins of joins.
106
- You can fall back to Spark DataFrames to cover these cases. However, if you have to use joins of joins, most likely there is potential to optimise your feature group structure.
142
+ In online inference, when you want to retrieve features in your online model, you have to provide all foreign key values,
143
+ known as the serving_keys, from the parent feature group to retrieve your precomputed feature values using the feature view.
144
+
145
+ === "Python"
146
+ ```python
147
+ feature_vector = feature_view.get_feature_vector({
148
+ "cc_num": "1234 5555 3333 8888",
149
+ "issuer_id": 20440455,
150
+ "merchant_id": 44208484,
151
+ "account_id": 84403331
152
+ })
153
+ ```
154
+
155
+ #### Snowflake schema
156
+ Hopsworks also provides the possibility to define a feature view that consists of a nested tree of children (up to a depth of 20)
157
+ from the root (leftmost) feature group. This is called the Snowflake schema data model, where you need to build nested tables (subtrees) using joins, and then join the
158
+ subtrees to their parents iteratively until you reach the root node (the leftmost feature group in the feature selection):
159
+
160
+ <p align =" center " >
161
+ <figure >
162
+ <img src="../../../../assets/images/guides/fs/feature_view/snowflake.png" alt="Snowflake schema data model">
163
+ <figcaption>Snowflake schema data model</figcaption>
164
+ </figure >
165
+ </p >
166
+
167
+ === "Python"
168
+ ```python
169
+ nested_selection = aggregated_cc_transactions.select_all() \
170
+ .join(account_details.select_all()) \
171
+ .join(cc_issuer_details.select_all())
172
+
173
+ selected_features = credit_card_transactions.select_all() \
174
+ .join(nested_selection) \
175
+ .join(merchant_details.select_all())
176
+ ```
177
+
178
+ Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group)
179
+ to retrieve the precomputed features:
180
+
181
+ === "Python"
182
+ ```python
183
+ feature_vector = feature_view.get_feature_vector({
184
+ "cc_num": "1234 5555 3333 8888",
185
+ "merchant_id": 44208484,
186
+ })
187
+ ```
107
188
108
189
#### Filter
109
190
@@ -114,48 +195,48 @@ For the Scala part of the API, equivalent methods are available in the `Feature`
114
195
115
196
=== "Python"
116
197
```python
117
- filtered_rain = rain_fg .filter(rain_fg.location_id == 10 )
198
+ filtered_credit_card_transactions = credit_card_transactions_fg .filter(credit_card_transactions_fg.category == "Grocery" )
118
199
```
119
200
120
201
=== "Scala"
121
202
```scala
122
- val filteredRain = rainFg .filter(rainFg .getFeature("location_id ").eq(10 ))
203
+ val filteredCreditCardTransactions = creditCardTransactionsFg .filter(creditCardTransactionsFg .getFeature("category ").eq("Grocery" ))
123
204
```
124
205
125
206
Filters are fully compatible with joins:
126
207
127
208
=== "Python"
128
209
```python
129
- feature_join = rain_fg .select_all() \
130
- .join(temperature_fg .select_all(), on=[ "date", "location_id "] ) \
131
- .join(location_fg .select_all(), left_on=[ "location_id "] , right_on=[ "id"] , join_type="left" ) \
132
- .filter((rain_fg.location_id == 10 ) | (rain_fg.location_id == 20 ))
210
+ selected_features = credit_card_transactions_fg .select_all() \
211
+ .join(account_details_fg .select_all(), on=[ "cc_num "] ) \
212
+ .join(merchant_details_fg .select_all(), left_on=[ "merchant_id "] , right_on=[ "id"] ) \
213
+ .filter((credit_card_transactions_fg.category == "Grocery" ) | (credit_card_transactions_fg.category == "Restaurant/Cafeteria" ))
133
214
```
134
215
135
216
=== "Scala"
136
217
```scala
137
- val featureJoin = (rainFg .selectAll()
138
- .join(temperatureFg .selectAll(), Seq("date", "location_id "))
139
- .join(locationFg .selectAll(), Seq("location_id "), Seq("id"), "left")
140
- .filter(rainFg .getFeature("location_id ").eq(10 ).or(rainFg .getFeature("location_id ").eq(20 ))))
218
+ val selectedFeatures = (creditCardTransactionsFg .selectAll()
219
+ .join(accountDetailsFg .selectAll(), Seq("cc_num "))
220
+ .join(merchantDetailsFg .selectAll(), Seq("merchant_id "), Seq("id"), "left")
221
+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Grocery" ).or(creditCardTransactionsFg .getFeature("category ").eq("Restaurant/Cafeteria" ))))
141
222
```
142
223
143
224
The filters can be applied at any point of the query:
144
225
145
226
=== "Python"
146
227
```python
147
- feature_join = rain_fg .select_all() \
148
- .join(temperature_fg .select_all().filter(temperature_fg .avg_temp >= 22), on=[ "date", "location_id "] ) \
149
- .join(location_fg .select_all(), left_on=[ "location_id "] , right_on=[ "id"] , join_type="left" ) \
150
- .filter(rain_fg.location_id == 10 )
228
+ selected_features = credit_card_transactions_fg .select_all() \
229
+ .join(account_details_fg .select_all().filter(account_details_fg .avg_temp >= 22), on=[ "cc_num "] ) \
230
+ .join(merchant_details_fg .select_all(), left_on=[ "merchant_id "] , right_on=[ "id"] ) \
231
+ .filter(credit_card_transactions_fg.category == "Grocery" )
151
232
```
152
233
153
234
=== "Scala"
154
235
```scala
155
- val featureJoin = (rainFg .selectAll()
156
- .join(temperatureFg .selectAll().filter(temperatureFg .getFeature("avg_temp").ge(22)), Seq("date", "location_id "))
157
- .join(locationFg .selectAll(), Seq("location_id "), Seq("id"), "left")
158
- .filter(rainFg .getFeature("location_id ").eq(10 )))
236
+ val selectedFeatures = (creditCardTransactionsFg .selectAll()
237
+ .join(accountDetailsFg .selectAll().filter(accountDetailsFg .getFeature("avg_temp").ge(22)), Seq("cc_num "))
238
+ .join(merchantDetailsFg .selectAll(), Seq("merchant_id "), Seq("id"), "left")
239
+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Grocery" )))
159
240
```
160
241
161
242
#### Joins and/or Filters on feature view query
@@ -166,23 +247,23 @@ However, this operation will not update the metadata and persist the updated que
166
247
=== "Python"
167
248
```python
168
249
fs = ...
169
- wind_speed_fg = fs.get_feature_group(name="wind_speed_fg ", version=1)
170
- rain_fg = fs.get_feature_group(name="rain_fg ", version=1)
171
- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
250
+ merchant_details_fg = fs.get_feature_group(name="merchant_details ", version=1)
251
+ credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions ", version=1)
252
+ feature_view = fs.get_feature_view(“credit_card_fraud ”, version=1)
172
253
feature_view.query \
173
- .join(wind_speed_fg .select_all()) \
174
- .filter((rain_fg.location_id == 54 )
254
+ .join(merchant_details_fg .select_all()) \
255
+ .filter(credit_card_transactions_fg.category == "Cash Withdrawal" )
175
256
```
176
257
177
258
=== "Scala"
178
259
```scala
179
260
val fs = ...
180
- val windSpeedFg = fs.getFeatureGroup("wind_speed_fg ", 1)
181
- val rainFg = fs.getFeatureGroup("rain_fg ", 1)
182
- val featureView = fs.getFeatureView(“rain_dataset ”, 1)
261
+ val merchantDetailsFg = fs.getFeatureGroup("merchant_details ", 1)
262
+ val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions ", 1)
263
+ val featureView = fs.getFeatureView(“credit_card_fraud ”, 1)
183
264
featureView.getQuery()
184
- .join(windSpeedFg .selectAll())
185
- .filter(rainFg .getFeature("location_id ").eq(54 ))
265
+ .join(merchantDetailsFg .selectAll())
266
+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Cash Withdrawal" ))
186
267
```
187
268
188
269
!!! warning
@@ -192,45 +273,46 @@ However, this operation will not update the metadata and persist the updated que
192
273
=== "Python"
193
274
```python
194
275
fs = ...
195
- wind_speed_fg = fs.get_feature_group(name="wind_speed_fg", version=1)
196
- solar_irradiance_fg = fs.get_feature_group(name="solar_irradiance_fg", version=1)
197
- rain_fg = fs.get_feature_group(name="rain_fg", version=1)
276
+
277
+ merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
278
+ account_details_fg = fs.get_feature_group(name="account_details", version=1)
279
+ credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
198
280
199
281
# fetch new feature view and its query instance
200
- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
282
+ feature_view = fs.get_feature_view(“credit_card_fraud ”, version=1)
201
283
202
- # apply join/filter logic based on location and wind speed
203
- feature_view.query.join(wind_speed_fg .select_all()) \
204
- .filter((rain_fg.location_id == 54 )
284
+ # apply join/filter logic based on purchase type
285
+ feature_view.query.join(merchant_details_fg .select_all()) \
286
+ .filter(credit_card_transactions_fg.category == "Cash Withdrawal" )
205
287
206
- # to apply new logic independent of location and wind speed from above
288
+ # to apply new logic independent of purchase type from above
207
289
# re-fetch new feature view and its query instance
208
- feature_view = fs.get_feature_view(“rain_dataset ”, version=1)
290
+ feature_view = fs.get_feature_view(“credit_card_fraud ”, version=1)
209
291
210
- # apply new join/filter logic based on solar irradiance
211
- feature_view.query.join(solar_irradiance_fg .select_all()) \
212
- .filter(solar_irradiance_fg.location_id == 28 )
292
+ # apply new join/filter logic based on account details
293
+ feature_view.query.join(merchant_details_fg .select_all()) \
294
+ .filter(account_details_fg.gender == "F" )
213
295
```
214
296
215
297
=== "Scala"
216
298
```scala
217
299
fs = ...
218
- windSpeedFg = fs.getFeatureGroup("wind_speed_fg ", 1)
219
- solarIrradianceFg = fs.getFeatureGroup("solar_irradiance_fg ", 1)
220
- rainFg = fs.getFeatureGroup("rain_fg ", 1)
300
+ merchantDetailsFg = fs.getFeatureGroup("merchant_details ", 1)
301
+ accountDetailsFg = fs.getFeatureGroup("account_details ", 1)
302
+ creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions ", 1)
221
303
222
304
// fetch new feature view and its query instance
223
- val featureView = fs.getFeatureView(“rain_dataset ”, version=1)
305
+ val featureView = fs.getFeatureView("credit_card_fraud", 1)
224
306
225
- // apply join/filter logic based on location and wind speed
226
- featureView.getQuery.join(windSpeedFg .selectAll())
227
- .filter(rainFg .getFeature("location_id ").eq(54 ))
307
+ // apply join/filter logic based on purchase type
308
+ featureView.getQuery.join(merchantDetailsFg .selectAll())
309
+ .filter(creditCardTransactionsFg .getFeature("category ").eq("Cash Withdrawal" ))
228
310
229
- // to apply new logic independent of location and wind speed from above
311
+ // to apply new logic independent of purchase type from above
230
312
// re-fetch new feature view and its query instance
231
- val featureView = fs.getFeatureView(“rain_dataset ”, 1)
313
+ val featureView = fs.getFeatureView(“credit_card_fraud ”, 1)
232
314
233
- // apply new join/filter logic based on solar irradiance
234
- featureView.getQuery.join(solarIrradianceFg .selectAll())
235
- .filter(solarIrradianceFg .getFeature("location_id ").eq(28 ))
315
+ // apply new join/filter logic based on account details
316
+ featureView.getQuery.join(merchantDetailsFg .selectAll())
317
+ .filter(accountDetailsFg .getFeature("gender ").eq("F" ))
236
318
```
0 commit comments