jitingxu1
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
Lines changed: 1 addition & 1 deletion
diff --git a/docs/index.qmd b/docs/index.qmd
Lines changed: 3 additions & 2 deletions
diff --git a/docs/reference/support-matrix/step_config.yml b/docs/reference/support-matrix/step_config.yml
Lines changed: 24 additions & 24 deletions
diff --git a/docs/tutorial/pytorch.qmd b/docs/tutorial/pytorch.qmd
Lines changed: 12 additions & 32 deletions
diff --git a/docs/tutorial/scikit-learn.qmd b/docs/tutorial/scikit-learn.qmd
Lines changed: 12 additions & 32 deletions
diff --git a/docs/tutorial/xgboost.qmd b/docs/tutorial/xgboost.qmd
Lines changed: 12 additions & 32 deletions
@@ -214,9 +214,9 @@ quartodoc:
             name: Temporal feature extraction
             desc: Feature extraction for temporal columns
           contents:
-            - ExpandDateTime
             - ExpandDate
             - ExpandTime
+            - ExpandTimestamp
 
         - kind: page
           path: steps-other
 
@@ -11,8 +11,9 @@ hide-description: true
 
 - Preprocess your data at scale on any [Ibis](https://ibis-project.org/)-supported
   backend.
-- Compose [`Recipe`](/reference/core.html#ibis_ml.Recipe)s with other scikit-learn
-  estimators using
+- Compose
+  [`Recipe`](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)s
+  with other scikit-learn estimators using
   [`Pipeline`](https://scikit-learn.org/stable/modules/compose.html#pipeline-chaining-estimators)s.
 - Seamlessly integrate with [scikit-learn](https://scikit-learn.org/stable/),
   [XGBoost](https://xgboost.readthedocs.io/en/stable/python/sklearn_estimator.html), and
 
@@ -90,7 +90,30 @@ ExpandDate:
         components:
           - doy
 
-ExpandDateTime:
+ExpandTime:
+  configurations:
+    - name: h
+      config:
+        inputs: time
+        components:
+          - hour
+    - name: m
+      config:
+        inputs: time
+        components:
+          - minute
+    - name: s
+      config:
+        inputs: time
+        components:
+          - second
+    - name: ms
+      config:
+        inputs: time
+        components:
+          - millisecond
+
+ExpandTimestamp:
   configurations:
     - name: ms
       config:
@@ -137,26 +160,3 @@ ExpandDateTime:
         inputs: timestamp
         components:
           - doy
-
-ExpandTime:
-  configurations:
-    - name: h
-      config:
-        inputs: time
-        components:
-          - hour
-    - name: m
-      config:
-        inputs: time
-        components:
-          - minute
-    - name: s
-      config:
-        inputs: time
-        components:
-          - second
-    - name: ms
-      config:
-        inputs: time
-        components:
-          - millisecond
@@ -102,7 +102,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -122,44 +122,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),
 
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),
 
@@ -101,7 +101,7 @@ flight_data = (
         "time_hour",
     )
     # Exclude missing data
-    .dropna()
+    .drop_null()
 )
 flight_data
 ```
@@ -121,44 +121,24 @@ To get started, let's split this single dataset into two: a _training_ set and a
 Because the order of rows in an Ibis table is undefined, we need a unique key to split the data reproducibly. [It is permissible for airlines to use the same flight number for different routes, as long as the flights do not operate on the same day. This means that the combination of the flight number and the date of travel is always unique.](https://www.euclaim.com/blog/flight-numbers-explained#:~:text=Can%20flight%20numbers%20be%20reused,of%20travel%20is%20always%20unique.)
 
 ```{python}
-flight_data_with_unique_key = flight_data.mutate(
-    unique_key=ibis.literal(",").join(
-        [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
-    )
-)
-flight_data_with_unique_key
-```
-
-```{python}
-# FIXME(deepyaman): Proposed key isn't unique for actual departure date.
-flight_data_with_unique_key.group_by("unique_key").mutate(
-    cnt=flight_data_with_unique_key.count()
-)[ibis._.cnt > 1]
-```
-
-```{python}
-import random
-
-# Fix the random numbers by setting the seed
-# This enables the analysis to be reproducible when random numbers are used
-random.seed(222)
-
-# Put 3/4 of the data into the training set
-random_key = str(random.getrandbits(256))
-data_split = flight_data_with_unique_key.mutate(
-    train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
-)
+import ibis_ml as ml
 
 # Create data frames for the two sets:
-train_data = data_split[data_split.train].drop("unique_key", "train")
-test_data = data_split[~data_split.train].drop("unique_key", "train")
+train_data, test_data = ml.train_test_split(
+    flight_data,
+    unique_key=["carrier", "flight", "date"],
+    # Put 3/4 of the data into the training set
+    test_size=0.25,
+    num_buckets=4,
+    # Fix the random numbers by setting the seed
+    # This enables the analysis to be reproducible when random numbers are used
+    random_seed=222,
+)
 ```
 
 ## Create features
 
 ```{python}
-import ibis_ml as ml
-
 flights_rec = ml.Recipe(
     ml.ExpandDate("date", components=["dow", "month"]),
     ml.Drop("date"),