From e61f6123da5a2bad8e266dcee0dc54961bb4b935 Mon Sep 17 00:00:00 2001
From: john <bblodfon@gmail.com>
Date: Wed, 5 Feb 2025 11:32:58 +0100
Subject: [PATCH] add 'max_nfeatures' argument

---
 NEWS.md                                |  2 ++
 R/EnsembleFSResult.R                   | 29 ++++++++++++++++++------
 man/AutoFSelector.Rd                   |  1 +
 man/ensemble_fs_result.Rd              | 31 +++++++++++++++++++++-----
 tests/testthat/test_ensemble_fselect.R |  9 ++++++++
 5 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 5eebdfd7..c2098124 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # mlr3fselect (development version)
 
+* feat: Add `max_nfeatures` argument to the `pareto_front()` and `knee_points()` methods of `EnsembleFSResult()` objects
+
 # mlr3fselect 1.3.0
 
 * refactor: Use [fastVoteR](https://github.com/bblodfon/fastVoteR) for feature ranking in `EnsembleFSResult()` objects
diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R
index 3ab903a9..bc63787e 100644
--- a/R/EnsembleFSResult.R
+++ b/R/EnsembleFSResult.R
@@ -379,17 +379,26 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
     #'
     #' @param type (`character(1)`)\cr
     #'  Specifies the type of Pareto front to return. See details.
+    #' @param max_nfeatures (`integer(1)`)\cr
+    #'  Specifies the maximum number of features for which the estimated Pareto
+    #'  front is computed. Applicable only when `type = "estimated"`.
+    #'  If `NULL` (default), the maximum number of features
+    #'  is determined by the ensemble feature selection process.
     #'
     #' @details
     #' Two options are available for the Pareto front:
     #' - `"empirical"` (default): returns the empirical Pareto front.
     #' - `"estimated"`: the Pareto front points are estimated by fitting a linear model with the inversed of the number of features (\eqn{1/x}) as input and the associated performance scores as output.
-    #'  This method is useful when the Pareto points are sparse and the front  assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy).
-    #'  The `estimated` Pareto front will include points for a number of features ranging from 1 up to the maximum number found in the empirical Pareto front.
+    #'
+    #'  This method is useful when the Pareto points are sparse and the front assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy).
+    #'
+    #'  When `type = "estimated"`, the estimated Pareto front includes points with the number of features ranging from 1 up to `max_nfeatures`.
+    #'  If `max_nfeatures` is not provided, it defaults to the maximum number of features available in the ensemble feature selection `result`, i.e. the maximum out of all learners and resamplings included.
     #'
     #' @return A [data.table::data.table] with columns the number of features and the performance that together form the Pareto front.
-    pareto_front = function(type = "empirical") {
+    pareto_front = function(type = "empirical", max_nfeatures = NULL) {
       assert_choice(type, choices =  c("empirical", "estimated"))
+      assert_numeric(max_nfeatures, lower = 1, null.ok = TRUE)
       result = private$.result
       measure = self$measure # get active measure
       measure_id = ifelse(private$.active_measure == "inner",
@@ -441,7 +450,8 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
         model = stats::lm(formula = form, data = pf)
 
         # Predict values using the model to create a smooth curve
-        pf_pred = data.table(n_features = seq(1, max(data$n_features)))
+        if (is.null(max_nfeatures)) max_nfeatures = max(data[["n_features"]])
+        pf_pred = data.table(n_features = seq(1, max_nfeatures))
         pf_pred[, n_features_inv := 1 / n_features]
         pf_pred[, (measure_id) := stats::predict(model, newdata = pf_pred)]
         pf_pred$n_features_inv = NULL
@@ -463,13 +473,18 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
     #' The knee point is determined as the Pareto point with the maximum distance from this line, see Das (1999).
     #'
     #' @param method (`character(1)`)\cr
-    #'  Type of method to use to identify the knee point. See details.
+    #'  Type of method to use to identify the knee point.
     #' @param type (`character(1)`)\cr
     #'  Specifies the type of Pareto front to use for the identification of the knee point.
+    #' @param max_nfeatures (`integer(1)`)\cr
+    #'  Specifies the maximum number of features for which the estimated Pareto
+    #'  front is computed. Applicable only when `type = "estimated"`.
+    #'  If `NULL` (default), the maximum number of features
+    #'  is determined by the ensemble feature selection process.
     #'  See `pareto_front()` method for more details.
     #'
     #' @return A [data.table::data.table] with the knee point(s) of the Pareto front.
-    knee_points = function(method = "NBI", type = "empirical") {
+    knee_points = function(method = "NBI", type = "empirical", max_nfeatures = NULL) {
       assert_choice(method, choices = c("NBI"))
       assert_choice(type, choices = c("empirical", "estimated"))
       measure = self$measure # get active measure
@@ -478,7 +493,7 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
                           measure$id)
       minimize = measure$minimize
 
-      pf = if (type == "empirical") self$pareto_front() else self$pareto_front(type = "estimated")
+      pf = if (type == "empirical") self$pareto_front() else self$pareto_front(type = "estimated", max_nfeatures = max_nfeatures)
 
       # Scale the Pareto front data to (0-1) range
       nfeats = perf = dist_to_line = NULL
diff --git a/man/AutoFSelector.Rd b/man/AutoFSelector.Rd
index 365d9385..9b1c5168 100644
--- a/man/AutoFSelector.Rd
+++ b/man/AutoFSelector.Rd
@@ -149,6 +149,7 @@ Hash (unique identifier) for this partial object, excluding some components whic
 \if{html}{\out{
 <details><summary>Inherited methods</summary>
 <ul>
+<li><span class="pkg-link" data-pkg="mlr3" data-topic="Learner" data-id="configure"><a href='../../mlr3/html/Learner.html#method-Learner-configure'><code>mlr3::Learner$configure()</code></a></span></li>
 <li><span class="pkg-link" data-pkg="mlr3" data-topic="Learner" data-id="encapsulate"><a href='../../mlr3/html/Learner.html#method-Learner-encapsulate'><code>mlr3::Learner$encapsulate()</code></a></span></li>
 <li><span class="pkg-link" data-pkg="mlr3" data-topic="Learner" data-id="format"><a href='../../mlr3/html/Learner.html#method-Learner-format'><code>mlr3::Learner$format()</code></a></span></li>
 <li><span class="pkg-link" data-pkg="mlr3" data-topic="Learner" data-id="help"><a href='../../mlr3/html/Learner.html#method-Learner-help'><code>mlr3::Learner$help()</code></a></span></li>
diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd
index 77fa2314..e5ae6e2a 100644
--- a/man/ensemble_fs_result.Rd
+++ b/man/ensemble_fs_result.Rd
@@ -388,7 +388,7 @@ This function identifies the \strong{Pareto front} of the ensemble feature
 selection process, i.e., the set of points that represent the trade-off
 between the number of features and performance (e.g. classification error).
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{EnsembleFSResult$pareto_front(type = "empirical")}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{EnsembleFSResult$pareto_front(type = "empirical", max_nfeatures = NULL)}\if{html}{\out{</div>}}
 }
 
 \subsection{Arguments}{
@@ -396,6 +396,12 @@ between the number of features and performance (e.g. classification error).
 \describe{
 \item{\code{type}}{(\code{character(1)})\cr
 Specifies the type of Pareto front to return. See details.}
+
+\item{\code{max_nfeatures}}{(\code{integer(1)})\cr
+Specifies the maximum number of features for which the estimated Pareto
+front is computed. Applicable only when \code{type = "estimated"}.
+If \code{NULL} (default), the maximum number of features
+is determined by the ensemble feature selection process.}
 }
 \if{html}{\out{</div>}}
 }
@@ -404,9 +410,12 @@ Two options are available for the Pareto front:
 \itemize{
 \item \code{"empirical"} (default): returns the empirical Pareto front.
 \item \code{"estimated"}: the Pareto front points are estimated by fitting a linear model with the inversed of the number of features (\eqn{1/x}) as input and the associated performance scores as output.
-This method is useful when the Pareto points are sparse and the front  assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy).
-The \code{estimated} Pareto front will include points for a number of features ranging from 1 up to the maximum number found in the empirical Pareto front.
 }
+
+This method is useful when the Pareto points are sparse and the front assumes a convex shape if better performance corresponds to lower measure values (e.g. classification error), or a concave shape otherwise (e.g. classification accuracy).
+
+When \code{type = "estimated"}, the estimated Pareto front includes points with the number of features ranging from 1 up to \code{max_nfeatures}.
+If \code{max_nfeatures} is not provided, it defaults to the maximum number of features available in the ensemble feature selection \code{result}, i.e. the maximum out of all learners and resamplings included.
 }
 
 \subsection{Returns}{
@@ -420,17 +429,27 @@ A \link[data.table:data.table]{data.table::data.table} with columns the number o
 This function implements various \emph{knee} point identification (KPI) methods, which select points in the Pareto front, such that an optimal trade-off between performance and number of features is achieved.
 In most cases, only one such point is returned.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{EnsembleFSResult$knee_points(method = "NBI", type = "empirical")}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{EnsembleFSResult$knee_points(
+  method = "NBI",
+  type = "empirical",
+  max_nfeatures = NULL
+)}\if{html}{\out{</div>}}
 }
 
 \subsection{Arguments}{
 \if{html}{\out{<div class="arguments">}}
 \describe{
 \item{\code{method}}{(\code{character(1)})\cr
-Type of method to use to identify the knee point. See details.}
+Type of method to use to identify the knee point.}
 
 \item{\code{type}}{(\code{character(1)})\cr
-Specifies the type of Pareto front to use for the identification of the knee point.
+Specifies the type of Pareto front to use for the identification of the knee point.}
+
+\item{\code{max_nfeatures}}{(\code{integer(1)})\cr
+Specifies the maximum number of features for which the estimated Pareto
+front is computed. Applicable only when \code{type = "estimated"}.
+If \code{NULL} (default), the maximum number of features
+is determined by the ensemble feature selection process.
 See \code{pareto_front()} method for more details.}
 }
 \if{html}{\out{</div>}}
diff --git a/tests/testthat/test_ensemble_fselect.R b/tests/testthat/test_ensemble_fselect.R
index d88cec0f..78ebb0b8 100644
--- a/tests/testthat/test_ensemble_fselect.R
+++ b/tests/testthat/test_ensemble_fselect.R
@@ -41,6 +41,9 @@ test_that("efs works", {
   pf_pred = efsr$pareto_front(type = "estimated")
   expect_data_table(pf_pred, nrows = max(efsr$result$n_features))
   expect_equal(names(pf_pred), c("n_features", "classif.ce"))
+  # restrict estimation of Pareto front up to a specific number of features
+  pf_pred2 = efsr$pareto_front(type = "estimated", max_nfeatures = 10)
+  expect_equal(pf_pred[1:10, classif.ce], pf_pred2[, classif.ce])
 
   # knee_points
   kps = efsr$knee_points()
@@ -49,6 +52,12 @@ test_that("efs works", {
   kpse = efsr$knee_points(type = "estimated")
   expect_data_table(kpse, nrows = 1)
   expect_true(kps$n_features != kpse$n_features)
+  # setting the default `max_nfeatures` doesn't change the Pareto front
+  kpse2 = efsr$knee_points(type = "estimated", max_nfeatures = max(efsr$result$n_features))
+  expect_equal(kpse, kpse2)
+  # fewer points in the estimated Pareto front, so the knee point changes
+  kpse3 = efsr$knee_points(type = "estimated", max_nfeatures = 15)
+  expect_true(kpse$n_features != kpse3$n_features)
 
   # data.table conversion
   tab = as.data.table(efsr)