Commit cddef7c

Merge branch 'ibis-project:main' into main
2 parents 1739bbd + 6dce35e commit cddef7c

6 files changed: +101, -280 lines

docs/tutorial/pytorch.qmd

Lines changed: 12 additions & 32 deletions
````diff
@@ -102,7 +102,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -122,44 +122,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),
````
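The removed cells and the new `ml.train_test_split` call express the same idea: hash a salted, user-supplied unique key into buckets and assign buckets to the training or test set, so the split is reproducible and does not depend on row order. Below is a minimal sketch of that bucket logic in plain Ibis, distilled from the deleted code above; the helper name and its bucket rounding are illustrative, not the actual ibis_ml implementation.

```python
import random

import ibis


def hash_bucket_split(table, unique_key, test_size=0.25, num_buckets=4, random_seed=222):
    # Illustrative helper (not ibis_ml's implementation): the same hash-bucket
    # approach as the hand-rolled code removed in this commit.
    random.seed(random_seed)
    salt = str(random.getrandbits(256))  # fixed salt keeps the split reproducible

    # Build one string key from the key columns, as the deleted cell did.
    key = ibis.literal(",").join([table[col].cast(str) for col in unique_key])

    # Rows whose salted hash lands in the first (1 - test_size) share of
    # buckets form the training set; the remaining rows form the test set.
    num_train_buckets = round((1 - test_size) * num_buckets)
    in_train = (key + salt).hash().abs() % num_buckets < num_train_buckets
    return table.filter(in_train), table.filter(~in_train)


# With the defaults above this reproduces the removed `% 4 < 3` logic:
# train_data, test_data = hash_bucket_split(
#     flight_data, unique_key=["carrier", "flight", "date"]
# )
```

The `ibis_ml.train_test_split` call in the new code takes the key columns, test fraction, bucket count, and seed directly, so the tutorials no longer need the hand-rolled version.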

docs/tutorial/scikit-learn.qmd

Lines changed: 12 additions & 32 deletions
````diff
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),
````

docs/tutorial/xgboost.qmd

Lines changed: 12 additions & 32 deletions
````diff
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),
````
