From 31ee55816906d121ecc54cb82dd11b0d0db83523 Mon Sep 17 00:00:00 2001
From: advieser <advieser@gmail.com>
Date: Thu, 23 Jan 2025 22:48:59 +0100
Subject: [PATCH] NA handling for PipeOpEncodeQuantiles

---
 R/PipeOpEncodePL.R | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R
index 5ea95f488..d7ae405ef 100644
--- a/R/PipeOpEncodePL.R
+++ b/R/PipeOpEncodePL.R
@@ -11,7 +11,7 @@
 #' in `private$.get_bins()`, and then creating new feature columns through a continuous alternative to one-hot encoding.
 #' Here, one new feature per bin is constructed, with values being either
 #' * `0`, if the original value was below the lower bin boundary,
-#' * `1`, if the original value was above the upper bin boundary, or
+#' * `1`, if the original value was above or equal to the upper bin boundary, or
 #' * a scaled value between `0` and `1`, if the original value was inside the bin boundaries. Scaling is done by
 #'   offsetting the original value by the lower bin boundary and dividing by the bin width.
 #'
@@ -46,7 +46,7 @@
 #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreprocSimple`], as well as:
 #' * `bins` :: named `list`\cr
 #'   Named list of numeric vectors. Each element corresponds to one of the affected feature columns and contains the
-#'   bin boundaries derived through the private method `.get_bins()`. The element vectors are named by the respective
+#'   bin boundaries derived through  `private$.get_bins()`. The element vectors are named by the respective
 #'   feature column.
 #'
 #' @section Parameters:
@@ -104,6 +104,10 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
         return(task)  # early exit
       }
 
+      # if (private$.can_handle_nas && any(task$missings())) {
+      #   stopf("%s does not support handling of tasks with missing data.", class(self))
+      # }
+
       dt = task$data(cols = cols)
       res = imap_dtc(dt, function(d, col) encode_piecewise_linear(d, bins[[col]]))
 
@@ -124,10 +128,13 @@ encode_piecewise_linear = function(column, bins) {
   for (t in seq_len(n_bins)) {
     lower = bins[[t]]
     upper = bins[[t + 1]]
+    colname = colnames(dt)[[t]]
 
-    dt[column >= upper, colnames(dt)[[t]] := 1]
-    indices = column < upper & column >= lower
-    dt[indices, colnames(dt)[[t]] := (column[indices] - lower) / (upper - lower)]
+    dt[column >= upper, (colname) := 1]
+    indices = !is.na(column) & column < upper & column >= lower
+    dt[indices, (colname) := (column[indices] - lower) / (upper - lower)]
+    # Filling NAs back in
+    dt[is.na(column), (colname) := NA]
   }
 
   dt
@@ -148,6 +155,8 @@ encode_piecewise_linear = function(column, bins) {
 #' AND HOW THE NUMBER IS TO BE INTERPRETEED
 #' (i.e. numsplits=2 creates 3 values, meaning five bins??!)
 #'
+#' document that nas are removed for calculating quantiles
+#'
 #' @section Construction:
 #' ```
 #' PipeOpEncodePL$new(id = "encodeplquantiles", param_vals = list())
@@ -168,11 +177,13 @@ encode_piecewise_linear = function(column, bins) {
 #'
 #' @section Parameters:
 #' The parameters are the parameters inherited from [`PipeOpEncodePL`]/[`PipeOpTaskPreprocSimple`], as well as:
-#' * `numsplits`  :: `numeric(1)` \cr
-#'   Default is ``.
+#' * `numsplits` :: `integer(1)` \cr
+#'   Number of bins to create. Default is `2`.
+#' * `type` :: `integer(1)`\cr
+#'   Method used to calculate sample quantiles. See help of [`stats::quantile`]. Default is `7`.
 #'
 #' @section Internals:
-#'
+#' Uses the [`stats::quantile`] function.
 #'
 #' @section Fields:
 #' Only fields inherited from [`PipeOpEncodePL`]/[`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
@@ -196,7 +207,8 @@ PipeOpEncodePLQuantiles = R6Class("PipeOpEncodePLQuantiles",
   public = list(
     initialize = function(id = "encodeplquantiles", param_vals = list()) {
       ps = ps(
-        numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"))
+        numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict")),
+        type = p_int(lower = 1, upper = 9, default = 7, tags = c("train", "predict"))
       )
       super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats")
     }
@@ -205,8 +217,15 @@ PipeOpEncodePLQuantiles = R6Class("PipeOpEncodePLQuantiles",
 
     .get_bins = function(task, cols) {
       numsplits = self$param_set$values$numsplits %??% 2
+      # Defaulting to default value in stats::quantile, i.e. method 7
+      type = self$param_set$values$type %??% 7
+
       lapply(task$data(cols = cols), function(d) {
-        unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d)))
+        unique(c(
+          min(d, na.rm = TRUE),
+          stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE, names = FALSE, type = type),
+          max(d, na.rm = TRUE)
+        ))
       })
     }
   )