diff --git a/NEWS.md b/NEWS.md index c612f2fcc..129d851ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,10 @@ # mlr3pipelines 0.7.1-9000 -* New parameter `no_collapse_above_absolute` in `PipeOpCollapseFactors` / `po("collapse_factors")`. +* New parameter `no_collapse_above_absolute` for `PipeOpCollapseFactors` / `po("collapse_factors")`. * Fix: `PipeOpCollapseFactors` now correctly collapses levels of ordered factors. * Fix: `LearnerClassifAvg` and `LearnerRegrAvg` hyperparameters get the `"required"` tag. * New parameter `use_groups` (default `TRUE`) for `PipeOpSubsampling` to respect grouping (changed default behaviour for grouped data) +* New parameter `new_role_direct` for `PipeOpColRoles` / `po("colroles")` to change column roles by role instead of by column. * Dictionary sugar functions `po()` / `pos()` / `ppl()` / `ppls()` now make suggestions for entries in both `mlr_pipeops` as well as `mlr_graphs` when an object by the given name could not be found in the respective dictionary. * New PipeOp `PipeOpDecode` / `po("decode")` to reverse one-hot or treatment encoding. diff --git a/R/PipeOpColRoles.R b/R/PipeOpColRoles.R index e5e56cc16..f5dd9d006 100644 --- a/R/PipeOpColRoles.R +++ b/R/PipeOpColRoles.R @@ -5,7 +5,9 @@ #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description -#' Changes the column roles of the input [`Task`][mlr3::Task] according to `new_role`. +#' Changes the column roles of the input [`Task`][mlr3::Task] according to `new_role` or its inverse `new_role_direct`. +#' +#' Setting a new target variable or changing the role of an existing target variable is not supported. #' #' @section Construction: #' ``` @@ -21,20 +23,25 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpTaskPreproc`]. #' -#' The output is the input [`Task`][mlr3::Task] with transformed column roles according to `new_role`. 
+#' The output is the input [`Task`][mlr3::Task] with transformed column roles according to `new_role` or its inverse `new_role_direct`. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`]. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: -#' * `new_role` :: `list`\cr -#' Named list of new column roles. The names must match the column names of the input task that +#' * `new_role` :: named `list`\cr +#' Named list of new column roles by column. The names must match the column names of the input task that #' will later be trained/predicted on. Each entry of the list must contain a character vector with -#' possible values of [`mlr_reflections$task_col_roles`][mlr3::mlr_reflections]. If the value is -#' given as `character()`, the column will be dropped from the input task. Changing the role of a -#' column results in this column loosing its previous role(s). Setting a new target variable or -#' changing the role of an existing target variable is not supported. +#' possible values of [`mlr_reflections$task_col_roles`][mlr3::mlr_reflections]. +#' If the value is given as `character()` or `NULL`, the column will be dropped from the input task. Changing the role +#' of a column results in this column losing its previous role(s). +#' * `new_role_direct` :: named `list`\cr +#' Named list of new column roles by role. The names must match the possible column roles, i.e. values of +#' [`mlr_reflections$task_col_roles`][mlr3::mlr_reflections]. Each entry of the list must contain a character +#' vector with column names of the input task that will later be trained/predicted on. +#' If the value is given as `character()` or `NULL`, all columns will be dropped from the role given in the element +#' name. The value given for a role overwrites the previous entry in `task$col_roles` for that role, completely. 
#' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. @@ -42,12 +49,22 @@ #' @examples #' library("mlr3") #' -#' task = tsk("boston_housing") +#' task = tsk("penguins") #' pop = po("colroles", param_vals = list( -#' new_role = list(town = c("order", "feature")) +#' new_role = list(body_mass = c("order", "feature")) #' )) #' -#' pop$train(list(task)) +#' train_out1 = pop$train(list(task))[[1L]] +#' train_out1$col_roles +#' +#' pop$param_set$set_values( +#' new_role = NULL, +#' new_role_direct = list(order = character(), group = "island") +#' ) +#' +#' train_out2 = pop$train(list(train_out1))[[1L]] +#' train_out2$col_roles +#' #' @family PipeOps #' @template seealso_pipeopslist #' @include PipeOpTaskPreproc.R @@ -57,20 +74,48 @@ PipeOpColRoles = R6Class("PipeOpColRoles", public = list( initialize = function(id = "colroles", param_vals = list()) { ps = ps( - # named list, each entry with a vector of roles + # named list, each entry with a vector of roles, names are columns new_role = p_uty( tags = c("train", "predict"), custom_check = crate(function(x) { - first_check = check_list(x, types = "character", any.missing = FALSE, min.len = 1L, names = "named") - # return the error directly if this failed - if (is.character(first_check)) { - return(first_check) + first_check = check_list(x, types = c("character", "null"), min.len = 1L, names = "unique") + # Return the error directly if this failed + if (!isTRUE(first_check)) return(first_check) + + # Only one column for roles "group", "weight", and "name" + counter = c(group = 0L, weight = 0L, name = 0L) + for (i in seq_along(x)) { + counter = counter + names(counter) %in% x[[i]] + } + if (any(counter > 1L)) { + return(sprintf("There may only be up to one column per role for role(s) %s.", str_collapse(names(which(counter > 1L)), quote = '"'))) } - # changing anything target related is not supported - # a value of "character()" will lead to the column being dropped + + 
# Changing anything target related is not supported. + # A value of character() or NULL is accepted. all_col_roles = unique(unlist(mlr3::mlr_reflections$task_col_roles)) check_subset(unlist(x), all_col_roles[all_col_roles != "target"]) }, .parent = topenv()) + ), + # named list, each with a vector of columns, names are column roles + new_role_direct = p_uty( + tags = c("train", "predict"), + custom_check = crate(function(x) { + first_check = check_list(x, types = c("character", "null"), min.len = 1L, names = "unique") + # Return the error directly if this failed + if (!isTRUE(first_check)) return(first_check) + + # Only one column for roles "group", "weight", and "name" + lens = lengths(x[c("group", "weight", "name")]) + if (any(lens > 1L)) { + return(sprintf("There may only be up to one column per role for role(s) %s.", str_collapse(names(which(lens > 1L)), quote = '"'))) + } + + # Changing anything target related is not supported. + # A value of character() or NULL is accepted. + all_col_roles = unique(unlist(mlr3::mlr_reflections$task_col_roles)) + check_subset(names(x), all_col_roles[all_col_roles != "target"]) + }, .parent = topenv()) ) ) super$initialize(id, param_set = ps, param_vals = param_vals, can_subset_cols = FALSE) @@ -78,33 +123,44 @@ PipeOpColRoles = R6Class("PipeOpColRoles", ), private = list( .transform = function(task) { - new_role = self$param_set$values$new_role + new_role = self$param_set$values[["new_role"]] + new_role_direct = self$param_set$values[["new_role_direct"]] - if (is.null(new_role)) { + if (is.null(new_role) && is.null(new_role_direct)) { return(task) # early exit } + if (!is.null(new_role) && !is.null(new_role_direct)) { + stop("Both parameters, 'new_role' and 'new_role_direct', are set. 
Provide only one parameter at a time.") + } - new_role_names = names(new_role) - ids = task$col_info$id - ids = ids[ids != "..row_ids"] - # names of "new_role" must be a subset of the column names of the task - assert_subset(new_role_names, choices = ids, empty.ok = FALSE) + # Create list new_role_direct with similar structure to col_roles (names are column roles, entries are column names) + if (!is.null(new_role)) { + # Set new_role_direct to task$col_roles with columns removed for which we change roles (as we want a column to only + # have the roles given for that column in new_role) + new_role_direct = map(task$col_roles, .f = function(x) x[x %nin% names(new_role)]) - # changing the role of a target is not supported - if (any(task$col_roles$target %in% new_role_names)) { - stopf("Cannot change the role of a target.") + # Add new role(s) for column(s) for which we change the role + possible_col_roles = mlr3::mlr_reflections$task_col_roles[[task$task_type]] + for (role in possible_col_roles) { + new_role_direct[[role]] = union( + new_role_direct[[role]], + names(which(unlist(map(new_role, .f = function(x) role %in% x)))) + ) + } } - # drop (all) old role(s) - task$col_roles = map(task$col_roles, .f = function(x) x[x %nin% new_role_names]) + # Replace NULLs with character(0) + new_role_direct = lapply(new_role_direct, as.character) - # add the new role(s) - all_col_roles = unique(unlist(mlr3::mlr_reflections$task_col_roles)) - for(role in all_col_roles) { - task$col_roles[[role]] = union(task$col_roles[[role]], - y = names(which(unlist(map(new_role, .f = function(x) role %in% x))))) + # Changing the role of a target is not supported + cols = unlist(new_role_direct[names(new_role_direct) != "target"]) + if (any(task$col_roles$target %in% cols)) { + stop("Cannot change the role of a target.") } + # Update column roles + task$col_roles[names(new_role_direct)] = new_role_direct + task } ) diff --git a/man/mlr_pipeops_colroles.Rd b/man/mlr_pipeops_colroles.Rd index 
dcd3ee912..798f629ea 100644 --- a/man/mlr_pipeops_colroles.Rd +++ b/man/mlr_pipeops_colroles.Rd @@ -8,7 +8,9 @@ \code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ -Changes the column roles of the input \code{\link[mlr3:Task]{Task}} according to \code{new_role}. +Changes the column roles of the input \code{\link[mlr3:Task]{Task}} according to \code{new_role} or its inverse \code{new_role_direct}. + +Setting a new target variable or changing the role of an existing target variable is not supported. } \section{Construction}{ @@ -28,7 +30,7 @@ be set during construction. Default \code{list()}. Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. -The output is the input \code{\link[mlr3:Task]{Task}} with transformed column roles according to \code{new_role}. +The output is the input \code{\link[mlr3:Task]{Task}} with transformed column roles according to \code{new_role} or its inverse \code{new_role_direct}. } \section{State}{ @@ -40,13 +42,18 @@ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherit The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ -\item \code{new_role} :: \code{list}\cr -Named list of new column roles. The names must match the column names of the input task that +\item \code{new_role} :: named \code{list}\cr +Named list of new column roles by column. The names must match the column names of the input task that will later be trained/predicted on. Each entry of the list must contain a character vector with -possible values of \code{\link[mlr3:mlr_reflections]{mlr_reflections$task_col_roles}}. If the value is -given as \code{character()}, the column will be dropped from the input task. Changing the role of a -column results in this column loosing its previous role(s). 
Setting a new target variable or -changing the role of an existing target variable is not supported. +possible values of \code{\link[mlr3:mlr_reflections]{mlr_reflections$task_col_roles}}. +If the value is given as \code{character()} or \code{NULL}, the column will be dropped from the input task. Changing the role +of a column results in this column losing its previous role(s). +\item \code{new_role_direct} :: named \code{list}\cr +Named list of new column roles by role. The names must match the possible column roles, i.e. values of +\code{\link[mlr3:mlr_reflections]{mlr_reflections$task_col_roles}}. Each entry of the list must contain a character +vector with column names of the input task that will later be trained/predicted on. +If the value is given as \code{character()} or \code{NULL}, all columns will be dropped from the role given in the element +name. The value given for a role overwrites the previous entry in \code{task$col_roles} for that role, completely. } } @@ -58,12 +65,22 @@ Only methods inherited from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{Pi \examples{ library("mlr3") -task = tsk("boston_housing") +task = tsk("penguins") pop = po("colroles", param_vals = list( - new_role = list(town = c("order", "feature")) + new_role = list(body_mass = c("order", "feature")) )) -pop$train(list(task)) +train_out1 = pop$train(list(task))[[1L]] +train_out1$col_roles + +pop$param_set$set_values( + new_role = NULL, + new_role_direct = list(order = character(), group = "island") +) + +train_out2 = pop$train(list(train_out1))[[1L]] +train_out2$col_roles + } \seealso{ https://mlr-org.com/pipeops.html diff --git a/tests/testthat/test_pipeop_colroles.R b/tests/testthat/test_pipeop_colroles.R index 500f5f0b3..42089bc44 100644 --- a/tests/testthat/test_pipeop_colroles.R +++ b/tests/testthat/test_pipeop_colroles.R @@ -1,47 +1,124 @@ context("PipeOpColRoles") test_that("PipeOpColRoles - basic properties", { + op = PipeOpColRoles$new() task = mlr_tasks$get("iris") + 
expect_pipeop(op) expect_equal(task, train_pipeop(op, inputs = list(task))$output) expect_equal(task, predict_pipeop(op, inputs = list(task))$output) expect_datapreproc_pipeop_class(PipeOpColRoles, task = task) + }) -test_that("PipeOpColRoles - assertion works", { +test_that("PipeOpColRoles - assertions on params work", { + expect_error(PipeOpColRoles$new(param_vals = list(new_role = "wrong")), regexp = "list") - expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "wrong", b = NA))), regexp = "character") - expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "wrong", b = "target"))), regexp = "subset") + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "wrong", b = NA))), regexp = "character,null") + expect_no_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = NULL)))) + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "wrong"))), regexp = "subset") + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "target"))), regexp = "subset") + + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "group", b = "group"))), regexp = "up to one column per role") + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "weight", b = "weight"))), regexp = "up to one column per role") + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = "name", b = "name"))), regexp = "up to one column per role") + expect_error(PipeOpColRoles$new(param_vals = list(new_role = list(a = c("group", "name"), b = c("group", "name")))), regexp = "up to one column per role.*?\"group\", \"name\"") + + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = "wrong")), regexp = "list") + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(wrong = "x", feature = NA))), regexp = "character,null") + expect_no_error(PipeOpColRoles$new(param_vals = list(new_role = list(feature = NULL)))) + 
expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(wrong = "x"))), regexp = "subset") + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(target = "y"))), regexp = "subset") + + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(group = c("x", "y")))), regexp = "up to one column per role") + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(weight = c("x", "y")))), regexp = "up to one column per role") + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(name = c("x", "y")))), regexp = "up to one column per role") + expect_error(PipeOpColRoles$new(param_vals = list(new_role_direct = list(group = c("x", "y"), name = c("x", "y")))), regexp = "up to one column per role.*?\"group\", \"name\"") + }) -test_that("PipeOpColRoles - name checking works", { +test_that("PipeOpColRoles - only existing columns are accepted", { + task = mlr_tasks$get("iris") task$cbind(data.table(rn = sprintf("%03d", 1:150))) + op = PipeOpColRoles$new(param_vals = list(new_role = list(rn = "name", wrong = "feature"))) expect_error(train_pipeop(op, inputs = list(task)), regexp = "subset") + + op = PipeOpColRoles$new(param_vals = list(new_role_direct = list(name = "rn", feature = "wrong"))) + expect_error(train_pipeop(op, inputs = list(task)), regexp = "subset") + }) test_that("PipeOpColRoles - changing the role of a target fails", { + task = mlr_tasks$get("iris") + op = PipeOpColRoles$new(param_vals = list(new_role = list(Species = "feature"))) expect_error(train_pipeop(op, inputs = list(task)), regexp = "role of a target") + + op = PipeOpColRoles$new(param_vals = list(new_role_direct = list(feature = "Species"))) + expect_error(train_pipeop(op, inputs = list(task)), regexp = "role of a target") + }) -test_that("PipeOpColRoles - functionality works", { +test_that("PipeOpColRoles - new_role works", { + task = mlr_tasks$get("iris") task$cbind(data.table(rn = sprintf("%03d", 1:150))) - op = 
PipeOpColRoles$new(param_vals = list(new_role = list(rn = "name", Petal.Length = "order", Petal.Width = character(0)))) - train_out = train_pipeop(op, inputs = list(task))$output + + op = PipeOpColRoles$new(param_vals = list(new_role = list( + rn = "name", Petal.Length = c("feature", "order"), Petal.Width = character(0), Sepal.Width = NULL)) + ) + + train_out = train_pipeop(op, inputs = list(task))[[1L]] + col_roles_actual = train_out$col_roles col_roles_expected = list( - feature = c("Sepal.Length", "Sepal.Width"), target = "Species", name = "rn", - order = "Petal.Length", stratum = character(0), group = character(0), weight = character(0)) + feature = c("Sepal.Length", "Petal.Length"), target = "Species", name = "rn", order = "Petal.Length", + stratum = character(0), group = character(0), weight = character(0) + ) + + # Compatibility with upcoming new weights_learner role in mlr3 if ("weights_learner" %in% names(task)) names(col_roles_expected)[names(col_roles_expected) == "weight"] = "weights_learner" + expect_equal(train_out$col_roles[names(col_roles_expected)], col_roles_expected) expect_equal(train_out$row_names$row_name, task$data(cols = "rn")[[1L]]) expect_true("Petal.Width" %nin% colnames(train_out$data())) - predict_out = predict_pipeop(op, inputs = list(task))$output + expect_true("Sepal.Width" %nin% colnames(train_out$data())) + + predict_out = predict_pipeop(op, inputs = list(task))[[1L]] expect_equal(train_out, predict_out) }) + +test_that("PipeOpColRoles - new_role_direct works", { + + task = mlr_tasks$get("iris") + task$cbind(data.table(rn = sprintf("%03d", 1:150))) + task$col_roles$group = "Species" + + op = PipeOpColRoles$new(param_vals = list(new_role_direct = list( + name = "rn", order = c("Petal.Length", "Sepal.Length"), feature = character(0), group = NULL)) + ) + + train_out = train_pipeop(op, inputs = list(task))[[1L]] + + col_roles_actual = train_out$col_roles + col_roles_expected = list( + feature = character(0), target = "Species", name = 
"rn", order = c("Petal.Length", "Sepal.Length"), + stratum = character(0), group = character(0), weight = character(0) + ) + + # Compatibility with upcoming new weights_learner role in mlr3 + if ("weights_learner" %in% names(task)) names(col_roles_expected)[names(col_roles_expected) == "weight"] = "weights_learner" + + expect_equal(train_out$col_roles[names(col_roles_expected)], col_roles_expected) + expect_equal(train_out$row_names$row_name, task$data(cols = "rn")[[1L]]) + expect_equal(train_out$data(), task$data(cols = "Species")) + + predict_out = predict_pipeop(op, inputs = list(task))[[1L]] + expect_equal(train_out, predict_out) +}) +