Skip to content

Commit 86856bc

Browse files
authored
[FSTORE-1470] Documentation and Guides for Snowflake Schema (#398)
* snowflake schema documentation
1 parent e50ba52 commit 86856bc

File tree

3 files changed

+163
-81
lines changed

3 files changed

+163
-81
lines changed
Loading
Loading

docs/user_guides/fs/feature_view/query.md

+163-81
Original file line numberDiff line numberDiff line change
@@ -8,36 +8,49 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg
88

99
=== "Python"
1010
```python
11+
fs = ...
12+
credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
13+
account_details_fg = fs.get_feature_group(name="account_details", version=1)
14+
merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
15+
1116
# create a query
12-
feature_join = rain_fg.select_all() \
13-
.join(temperature_fg.select_all(), on=["date", "location_id"]) \
14-
.join(location_fg.select_all())
17+
selected_features = credit_card_transactions_fg.select_all() \
18+
.join(account_details_fg.select_all(), on=["cc_num"]) \
19+
.join(merchant_details_fg.select_all())
1520

1621
# save the query to feature view
1722
feature_view = fs.create_feature_view(
18-
name='rain_dataset',
19-
query=feature_join
23+
version=1,
24+
name='credit_card_fraud',
25+
labels=["is_fraud"],
26+
query=selected_features
2027
)
2128

2229
# retrieve the query back from the feature view
23-
feature_view = fs.get_feature_view(“rain_dataset”, version=1)
30+
feature_view = fs.get_feature_view("credit_card_fraud", version=1)
2431
query = feature_view.query
2532
```
2633

2734
=== "Scala"
2835
```scala
36+
37+
val fs = ...
38+
val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1)
39+
val accountDetailsFg = fs.getFeatureGroup(name="account_details", version=1)
40+
val merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
41+
2942
// create a query
30-
val featureJoin = (rainFg.selectAll()
31-
.join(temperatureFg.selectAll(), on=Seq("date", "location_id"))
32-
.join(locationFg.selectAll()))
43+
val selectedFeatures = (creditCardTransactionsFg.selectAll()
44+
.join(accountDetailsFg.selectAll(), on=Seq("cc_num"))
45+
.join(merchantDetailsFg.selectAll()))
3346

3447
val featureView = featureStore.createFeatureView()
35-
.name("rain_dataset")
36-
.query(featureJoin)
48+
.name("credit_card_fraud")
49+
.query(selectedFeatures)
3750
.build();
3851

3952
// retrieve the query back from the feature view
40-
val featureView = fs.getFeatureView(“rain_dataset”, 1)
53+
val featureView = fs.getFeatureView("credit_card_fraud", 1)
4154
val query = featureView.getQuery()
4255
```
4356

@@ -53,18 +66,18 @@ Selecting features from a feature group is a lazy operation, returning a query w
5366

5467
=== "Python"
5568
```python
56-
rain_fg = fs.get_feature_group("rain_fg")
69+
credit_card_transactions_fg = fs.get_feature_group("credit_card_transactions")
5770

5871
# Returns Query
59-
feature_join = rain_fg.select(["location_id", "weekly_rainfall"])
72+
selected_features = credit_card_transactions_fg.select(["amount", "latitude", "longitude"])
6073
```
6174

6275
=== "Scala"
6376
```Scala
64-
val rainFg = fs.getFeatureGroup("rain_fg")
77+
val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions")
6578

6679
// Returns Query
67-
val featureJoin = rainFg.select(Seq("location_id", "weekly_rainfall"))
80+
val selectedFeatures = creditCardTransactionsFg.select(Seq("amount", "latitude", "longitude"))
6881
```
6982

7083
#### Join
@@ -75,35 +88,103 @@ By default, Hopsworks will use the maximal matching subset of the primary keys o
7588
=== "Python"
7689
```python
7790
# Returns Query
78-
feature_join = rain_fg.join(temperature_fg)
91+
selected_features = credit_card_transactions_fg.join(account_details_fg)
7992
```
8093

8194
=== "Scala"
8295
```Scala
8396
// Returns Query
84-
val featureJoin = rainFg.join(temperatureFg)
97+
val selectedFeatures = creditCardTransactionsFg.join(accountDetailsFg)
8598
```
8699
More complex joins are possible by selecting subsets of features from the joined feature groups and by specifying a join key and type.
87-
Possible join types are "inner", "left" or "right". Furthermore, it is possible to specify different features for the join key of the left and right feature group.
88-
The join key lists should contain the names of the features to join on.
100+
Possible join types are "inner", "left" or "right". By default, `join_type` is `"left"`. Furthermore, it is possible to specify different
101+
features for the join key of the left and right feature group. The join key lists should contain the names of the features to join on.
89102

90103
=== "Python"
91104
```python
92-
feature_join = rain_fg.select_all() \
93-
.join(temperature_fg.select_all(), on=["date", "location_id"]) \
94-
.join(location_fg.select_all(), left_on=["location_id"], right_on=["id"], join_type="left")
105+
selected_features = credit_card_transactions_fg.select_all() \
106+
.join(account_details_fg.select_all(), on=["cc_num"]) \
107+
.join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"], join_type="inner")
95108
```
96109

97110
=== "Scala"
98111
```scala
99-
val featureJoin = (rainFg.selectAll()
100-
.join(temperatureFg.selectAll(), Seq("date", "location_id"))
101-
.join(locationFg.selectAll(), Seq("location_id"), Seq("id"), "left"))
112+
val selectedFeatures = (creditCardTransactionsFg.selectAll()
113+
.join(accountDetailsFg.selectAll(), Seq("cc_num"))
114+
.join(merchantDetailsFg.selectAll(), Seq("merchant_id"), Seq("id"), "inner"))
115+
```
116+
117+
### Data modeling in Hopsworks
118+
119+
Since v4.0, the Hopsworks feature selection API supports both Star and Snowflake Schema data models.
120+
121+
#### Star schema data model
122+
123+
When choosing the Star Schema data model, all tables are children of the parent (the leftmost) feature group, which has all
124+
foreign keys for its child feature groups.
125+
126+
<p align="center">
127+
<figure>
128+
<img src="../../../../assets/images/guides/fs/feature_view/star.png" alt="Star schema data model">
129+
<figcaption>Star schema data model</figcaption>
130+
</figure>
131+
</p>
132+
133+
=== "Python"
134+
```python
135+
selected_features = credit_card_transactions.select_all() \
136+
.join(aggregated_cc_transactions.select_all()) \
137+
.join(account_details.select_all()) \
138+
.join(merchant_details.select_all()) \
139+
.join(cc_issuer_details.select_all())
102140
```
103141

104-
!!! error "Nested Joins"
105-
The API currently does not support nested joins. That is joins of joins.
106-
You can fall back to Spark DataFrames to cover these cases. However, if you have to use joins of joins, most likely there is potential to optimise your feature group structure.
142+
In online inference, when you want to retrieve features in your online model, you have to provide all foreign key values,
143+
known as the serving_keys, from the parent feature group to retrieve your precomputed feature values using the feature view.
144+
145+
=== "Python"
146+
```python
147+
feature_vector = feature_view.get_feature_vector({
148+
'cc_num': "1234 5555 3333 8888",
149+
'issuer_id': 20440455,
150+
'merchant_id': 44208484,
151+
'account_id': 84403331
152+
})
153+
```
154+
155+
#### Snowflake schema
156+
Hopsworks also provides the possibility to define a feature view that consists of a nested tree of children (up to a depth of 20)
157+
from the root (leftmost) feature group. This is called the Snowflake Schema data model, where you need to build nested tables (subtrees) using joins, and then join the
158+
subtrees to their parents iteratively until you reach the root node (the leftmost feature group in the feature selection):
159+
160+
<p align="center">
161+
<figure>
162+
<img src="../../../../assets/images/guides/fs/feature_view/snowflake.png" alt="Snowflake schema data model">
163+
<figcaption>Snowflake schema data model</figcaption>
164+
</figure>
165+
</p>
166+
167+
=== "Python"
168+
```python
169+
nested_selection = aggregated_cc_transactions.select_all() \
170+
.join(account_details.select_all()) \
171+
.join(cc_issuer_details.select_all())
172+
173+
selected_features = credit_card_transactions.select_all() \
174+
.join(nested_selection) \
175+
.join(merchant_details.select_all())
176+
```
177+
178+
Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group)
179+
to retrieve the precomputed features:
180+
181+
=== "Python"
182+
```python
183+
feature_vector = feature_view.get_feature_vector({
184+
'cc_num': "1234 5555 3333 8888",
185+
'merchant_id': 44208484,
186+
})
187+
```
107188

108189
#### Filter
109190

@@ -114,48 +195,48 @@ For the Scala part of the API, equivalent methods are available in the `Feature`
114195

115196
=== "Python"
116197
```python
117-
filtered_rain = rain_fg.filter(rain_fg.location_id == 10)
198+
filtered_credit_card_transactions = credit_card_transactions_fg.filter(credit_card_transactions_fg.category == "Grocery")
118199
```
119200

120201
=== "Scala"
121202
```scala
122-
val filteredRain = rainFg.filter(rainFg.getFeature("location_id").eq(10))
203+
val filteredCreditCardTransactions = creditCardTransactionsFg.filter(creditCardTransactionsFg.getFeature("category").eq("Grocery"))
123204
```
124205

125206
Filters are fully compatible with joins:
126207

127208
=== "Python"
128209
```python
129-
feature_join = rain_fg.select_all() \
130-
.join(temperature_fg.select_all(), on=["date", "location_id"]) \
131-
.join(location_fg.select_all(), left_on=["location_id"], right_on=["id"], join_type="left") \
132-
.filter((rain_fg.location_id == 10) | (rain_fg.location_id == 20))
210+
selected_features = credit_card_transactions_fg.select_all() \
211+
.join(account_details_fg.select_all(), on=["cc_num"]) \
212+
.join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"]) \
213+
.filter((credit_card_transactions_fg.category == "Grocery") | (credit_card_transactions_fg.category == "Restaurant/Cafeteria"))
133214
```
134215

135216
=== "Scala"
136217
```scala
137-
val featureJoin = (rainFg.selectAll()
138-
.join(temperatureFg.selectAll(), Seq("date", "location_id"))
139-
.join(locationFg.selectAll(), Seq("location_id"), Seq("id"), "left")
140-
.filter(rainFg.getFeature("location_id").eq(10).or(rainFg.getFeature("location_id").eq(20))))
218+
val selectedFeatures = (creditCardTransactionsFg.selectAll()
219+
.join(accountDetailsFg.selectAll(), Seq("cc_num"))
220+
.join(merchantDetailsFg.selectAll(), Seq("merchant_id"), Seq("id"), "left")
221+
.filter(creditCardTransactionsFg.getFeature("category").eq("Grocery").or(creditCardTransactionsFg.getFeature("category").eq("Restaurant/Cafeteria"))))
141222
```
142223

143224
The filters can be applied at any point of the query:
144225

145226
=== "Python"
146227
```python
147-
feature_join = rain_fg.select_all() \
148-
.join(temperature_fg.select_all().filter(temperature_fg.avg_temp >= 22), on=["date", "location_id"]) \
149-
.join(location_fg.select_all(), left_on=["location_id"], right_on=["id"], join_type="left") \
150-
.filter(rain_fg.location_id == 10)
228+
selected_features = credit_card_transactions_fg.select_all() \
229+
.join(account_details_fg.select_all().filter(account_details_fg.avg_temp >= 22), on=["cc_num"]) \
230+
.join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"]) \
231+
.filter(credit_card_transactions_fg.category == "Grocery")
151232
```
152233

153234
=== "Scala"
154235
```scala
155-
val featureJoin = (rainFg.selectAll()
156-
.join(temperatureFg.selectAll().filter(temperatureFg.getFeature("avg_temp").ge(22)), Seq("date", "location_id"))
157-
.join(locationFg.selectAll(), Seq("location_id"), Seq("id"), "left")
158-
.filter(rainFg.getFeature("location_id").eq(10)))
236+
val selectedFeatures = (creditCardTransactionsFg.selectAll()
237+
.join(accountDetailsFg.selectAll().filter(accountDetailsFg.getFeature("avg_temp").ge(22)), Seq("cc_num"))
238+
.join(merchantDetailsFg.selectAll(), Seq("merchant_id"), Seq("id"), "left")
239+
.filter(creditCardTransactionsFg.getFeature("category").eq("Grocery")))
159240
```
160241

161242
#### Joins and/or Filters on feature view query
@@ -166,23 +247,23 @@ However, this operation will not update the metadata and persist the updated que
166247
=== "Python"
167248
```python
168249
fs = ...
169-
wind_speed_fg = fs.get_feature_group(name="wind_speed_fg", version=1)
170-
rain_fg = fs.get_feature_group(name="rain_fg", version=1)
171-
feature_view = fs.get_feature_view(“rain_dataset”, version=1)
250+
merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
251+
credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
252+
feature_view = fs.get_feature_view("credit_card_fraud", version=1)
172253
feature_view.query \
173-
.join(wind_speed_fg.select_all()) \
174-
.filter((rain_fg.location_id == 54)
254+
.join(merchant_details_fg.select_all()) \
255+
.filter(credit_card_transactions_fg.category == "Cash Withdrawal")
175256
```
176257

177258
=== "Scala"
178259
```scala
179260
val fs = ...
180-
val windSpeedFg = fs.getFeatureGroup("wind_speed_fg", 1)
181-
val rainFg = fs.getFeatureGroup("rain_fg", 1)
182-
val featureView = fs.getFeatureView(“rain_dataset”, 1)
261+
val merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
262+
val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1)
263+
val featureView = fs.getFeatureView("credit_card_fraud", 1)
183264
featureView.getQuery()
184-
.join(windSpeedFg.selectAll())
185-
.filter(rainFg.getFeature("location_id").eq(54))
265+
.join(merchantDetailsFg.selectAll())
266+
.filter(creditCardTransactionsFg.getFeature("category").eq("Cash Withdrawal"))
186267
```
187268

188269
!!! warning
@@ -192,45 +273,46 @@ However, this operation will not update the metadata and persist the updated que
192273
=== "Python"
193274
```python
194275
fs = ...
195-
wind_speed_fg = fs.get_feature_group(name="wind_speed_fg", version=1)
196-
solar_irradiance_fg = fs.get_feature_group(name="solar_irradiance_fg", version=1)
197-
rain_fg = fs.get_feature_group(name="rain_fg", version=1)
276+
277+
merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
278+
account_details_fg = fs.get_feature_group(name="account_details", version=1)
279+
credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
198280

199281
# fetch new feature view and its query instance
200-
feature_view = fs.get_feature_view(“rain_dataset”, version=1)
282+
feature_view = fs.get_feature_view("credit_card_fraud", version=1)
201283

202-
# apply join/filter logic based on location and wind speed
203-
feature_view.query.join(wind_speed_fg.select_all()) \
204-
.filter((rain_fg.location_id == 54)
284+
# apply join/filter logic based on purchase type
285+
feature_view.query.join(merchant_details_fg.select_all()) \
286+
.filter(credit_card_transactions_fg.category == "Cash Withdrawal")
205287

206-
# to apply new logic independent of location and wind speed from above
288+
# to apply new logic independent of purchase type from above
207289
# re-fetch new feature view and its query instance
208-
feature_view = fs.get_feature_view(“rain_dataset”, version=1)
290+
feature_view = fs.get_feature_view("credit_card_fraud", version=1)
209291

210-
# apply new join/filter logic based on solar irradiance
211-
feature_view.query.join(solar_irradiance_fg.select_all()) \
212-
.filter(solar_irradiance_fg.location_id == 28)
292+
# apply new join/filter logic based on account details
293+
feature_view.query.join(merchant_details_fg.select_all()) \
294+
.filter(account_details_fg.gender == "F")
213295
```
214296

215297
=== "Scala"
216298
```scala
217299
fs = ...
218-
windSpeedFg = fs.getFeatureGroup("wind_speed_fg", 1)
219-
solarIrradianceFg = fs.getFeatureGroup("solar_irradiance_fg", 1)
220-
rainFg = fs.getFeatureGroup("rain_fg", 1)
300+
merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
301+
accountDetailsFg = fs.getFeatureGroup("account_details", 1)
302+
creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1)
221303

222304
// fetch new feature view and its query instance
223-
val featureView = fs.getFeatureView(“rain_dataset”, version=1)
305+
val featureView = fs.getFeatureView("credit_card_fraud", version=1)
224306

225-
// apply join/filter logic based on location and wind speed
226-
featureView.getQuery.join(windSpeedFg.selectAll())
227-
.filter(rainFg.getFeature("location_id").eq(54))
307+
// apply join/filter logic based on purchase type
308+
featureView.getQuery.join(merchantDetailsFg.selectAll())
309+
.filter(creditCardTransactionsFg.getFeature("category").eq("Cash Withdrawal"))
228310

229-
// to apply new logic independent of location and wind speed from above
311+
// to apply new logic independent of purchase type from above
230312
// re-fetch new feature view and its query instance
231-
val featureView = fs.getFeatureView(“rain_dataset”, 1)
313+
val featureView = fs.getFeatureView("credit_card_fraud", 1)
232314

233-
// apply new join/filter logic based on solar irradiance
234-
featureView.getQuery.join(solarIrradianceFg.selectAll())
235-
.filter(solarIrradianceFg.getFeature("location_id").eq(28))
315+
// apply new join/filter logic based on account details
316+
featureView.getQuery.join(merchantDetailsFg.selectAll())
317+
.filter(accountDetailsFg.getFeature("gender").eq("F"))
236318
```

0 commit comments

Comments
 (0)