From 062ba85b38d9a35228d5882af241461d3b56c4b6 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Wed, 5 Mar 2025 17:04:36 +0100
Subject: [PATCH] moved feature selection

---
 src/SoleFeatures.jl                |   1 +
 src/dataset/prepare_dataset.jl     |   2 +-
 src/selection/fselection.jl        | 133 +++++++++++-
 src/selection/interface.jl         |   1 +
 test/benchmarks/03_FS_newStruct.jl | 336 +++++++++++++++--------------
 5 files changed, 304 insertions(+), 169 deletions(-)

diff --git a/src/SoleFeatures.jl b/src/SoleFeatures.jl
index 631e39b..d99f1c6 100644
--- a/src/SoleFeatures.jl
+++ b/src/SoleFeatures.jl
@@ -49,6 +49,7 @@ include("filters/univariate/variancefilter.jl")
 export VarianceRanking, VarianceThreshold
 
 include("selection/fselection.jl")
+export feature_selection
 
 # using SoleData
 # using Reexport
diff --git a/src/dataset/prepare_dataset.jl b/src/dataset/prepare_dataset.jl
index 1309145..97b0066 100644
--- a/src/dataset/prepare_dataset.jl
+++ b/src/dataset/prepare_dataset.jl
@@ -185,7 +185,7 @@ function _features_groupby(
 )::Vector{Vector{Int}}
     res = Dict{Any, Vector{Int}}()
     for (i, g) in enumerate(Xinfo)
-        key = Tuple(getproperty(g, field) for field in group)
+        key = Tuple(getproperty(g, field) for field in aggrby)
         push!(get!(res, key, Int[]), i)
     end
     return collect(values(res)) # Return the grouped indices
diff --git a/src/selection/fselection.jl b/src/selection/fselection.jl
index 6ef9405..0fc4e21 100644
--- a/src/selection/fselection.jl
+++ b/src/selection/fselection.jl
@@ -160,4 +160,135 @@ function _fsgroup(
     return _fsgroup(X, nothing, Xinfo, selector, limiter, aggrby; kwargs...)
 end
 
-_fsgroup(Xdf::AbstractDataFrame, args...) = _fsgroup(Matrix(Xdf), args...)
\ No newline at end of file
+_fsgroup(Xdf::AbstractDataFrame, args...) = _fsgroup(Matrix(Xdf), args...)
+
+# ---------------------------------------------------------------------------- #
+#                        main feature selection function                        #
+# ---------------------------------------------------------------------------- #
+"""
+TODO: documentation
+
+# Feature Selection with Aggregation Control
+
+## Overview
+The `feature_selection` function allows precise control over how feature aggregation
+is applied during the multi-step feature selection process.
+
+## Aggregation Parameter (`aggrby`)
+The `aggrby` parameter can be provided in two ways:
+
+1. **Single NamedTuple**: When provided as a single NamedTuple (not a vector),
+   aggregation is only applied during the final step of feature selection.
+   The function automatically creates a vector where:
+   - All positions except the last contain `nothing`
+   - The last position contains the provided aggregation parameters
+
+2. **Vector of NamedTuples**: When provided as a vector, each element specifies
+   the aggregation behavior for the corresponding step in `fs_methods`.
+"""
+function feature_selection(
+    X::AbstractMatrix{T},
+    y::Union{AbstractVector{<:Class}, Nothing},
+    Xinfo::Vector{<:InfoFeat};
+
+    aggrby::Union{ABT,AbstractVector{<:ABT}} = (
+        aggrby = (:var,),
+        aggregatef = length, # NOTE: use mean, minimum or maximum to aggregate scores instead of counting the selected features of each group
+        group_before_score = true,
+    ),
+
+    fs_methods::AbstractVector{<:NamedTuple{(:selector, :limiter)}} = [
+        ( # STEP 1: unsupervised variance-based filter
+            selector = SoleFeatures.VarianceFilter(SoleFeatures.IdentityLimiter()),
+            limiter = PercentageLimiter(0.5),
+        ),
+        ( # STEP 2: supervised Mutual Information filter
+            selector = SoleFeatures.MutualInformationClassif(SoleFeatures.IdentityLimiter()),
+            limiter = PercentageLimiter(0.1),
+        ),
+        ( # STEP 3: group results by variable
+            selector = IdentityFilter(),
+            limiter = SoleFeatures.IdentityLimiter(),
+        ),
+    ],
+
+    norm::Bool = false,
+    normalize_kwargs::NamedTuple = NamedTuple(),
+
+    cache_extracted_dataset::Union{Nothing,AbstractString} = nothing,
+    return_mid_results::Union{Val{true},Val{false}} = Val(true),
+# )::Union{DataFrame,Tuple{DataFrame,FSMidResults}} where {T<:Number}
+) where {T<:Number}
+    # prepare aggregation parameters:
+    # when aggrby is not a Vector, assume the user wants to aggregate only during
+    # the last step of feature selection (see the docstring above)
+    if !(aggrby isa AbstractVector)
+        aggrby = push!(Union{Nothing,NamedTuple}[fill(nothing, max(length(fs_methods)-1, 0))...], aggrby)
+    end
+
+    # prepare labels (y may be `nothing` for purely unsupervised pipelines)
+    y_coded = isnothing(y) ? nothing : CategoricalArrays.levelcode.(y)
+
+    # dataset normalization (in place)
+    norm && _normalize_dataset!(X, Xinfo; normalize_kwargs...)
+
+    # feature selection
+    fs_mid_results = NamedTuple{(:score, :indices, :group_aggr_func, :group_indices, :aggrby)}[]
+
+    for (fsm, gfs_params) in zip(fs_methods, aggrby)
+        current_dataset_col_slice = 1:size(X, 2)
+
+        # pick surviving columns only
+        for i in 1:length(fs_mid_results)
+            current_dataset_col_slice = current_dataset_col_slice[fs_mid_results[i].indices]
+        end
+
+        currX = X[:, current_dataset_col_slice]
+        currXinfo = Xinfo[current_dataset_col_slice]
+
+        dataset_param = isnothing(y_coded) || SoleFeatures.is_unsupervised(fsm.selector) ?
+            (currX, currXinfo) :
+            (currX, y_coded, currXinfo)
+
+        idxes, score, g_indices =
+            if isnothing(gfs_params)
+                # perform plain feature selection
+                SoleFeatures._fs(dataset_param..., fsm...)..., nothing
+            else
+                # perform aggregated (grouped) feature selection
+                sel_g_indices, g_scores, g_indices, grouped_variable_scores = SoleFeatures._fsgroup(
+                    dataset_param..., fsm..., gfs_params.aggrby;
+                    aggregatef = gfs_params.aggregatef,
+                    group_before_score = gfs_params.group_before_score
+                )
+
+                # find indices to re-sort the scores of all variables to their
+                # original position in dataset columns
+                old_sort = sortperm(vcat(g_indices...))
+
+                vcat(g_indices[sel_g_indices]...), vcat(vcat(grouped_variable_scores...)[old_sort]...), g_indices
+            end
+
+        sort!(idxes)
+
+        push!(fs_mid_results, (
+            score = score,
+            indices = idxes,
+            group_aggr_func = isnothing(gfs_params) ? nothing : gfs_params.aggregatef,
+            group_indices = g_indices,
+            aggrby = isnothing(gfs_params) ?
nothing : gfs_params.aggrby + )) + end + + dataset_col_slice = 1:size(X, 2) + for i in 1:length(fs_mid_results) + dataset_col_slice = dataset_col_slice[fs_mid_results[i].indices] + end + + if isa(return_mid_results, Val{true}) + return X, X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results) + else + return X[:,dataset_col_slice] + end +end +feature_selection(Xdf::AbstractDataFrame, args...; kwargs...) = feature_selection(Matrix(Xdf), args...; kwargs...) diff --git a/src/selection/interface.jl b/src/selection/interface.jl index ad61075..62cfcb7 100644 --- a/src/selection/interface.jl +++ b/src/selection/interface.jl @@ -14,6 +14,7 @@ abstract type AbstractSelResult end # ---------------------------------------------------------------------------- # # types # # ---------------------------------------------------------------------------- # +const ABT = Union{NamedTuple{(:aggrby,:aggregatef,:group_before_score)}, Nothing} # ---------------------------------------------------------------------------- # # data structures # diff --git a/test/benchmarks/03_FS_newStruct.jl b/test/benchmarks/03_FS_newStruct.jl index fe01351..da7aa81 100644 --- a/test/benchmarks/03_FS_newStruct.jl +++ b/test/benchmarks/03_FS_newStruct.jl @@ -492,194 +492,194 @@ end }}} } -""" -TODO: documentation - -# Feature Selection with Aggregation Control - -## Overview -The `feature_selection` function allows precise control over how feature aggregation -is applied during the multi-step feature selection process. - -## Aggregation Parameter (`aggrby`) -The `aggrby` parameter can be provided in two ways: - -1. **Single NamedTuple**: When provided as a single NamedTuple (not a vector), - aggregation is only applied during the final step of feature selection. - The function automatically creates a vector where: - - All positions except the last contain `nothing` - - The last position contains the provided aggregation parameters - -2. **Vector of NamedTuples**: When provided as a vector, each element specifies - the aggregation behavior for the corresponding step in `fs_methods`. -""" -function feature_selection( - X::AbstractMatrix{T}, - y::Union{Nothing,AbstractVector}, - Xinfo::Vector{<:SoleFeatures.InfoFeat}; - - # groups_separator::AbstractString = _SEPARATOR, - - # ex_windows::AbstractVector = [ FixedNumMovingWindows(5, 0.05)... 
], - # ex_measures::AbstractVector{Union{Function, SuperFeature}} = [minimum, maximum, mean], - - # cosa vuoi fare al dataset, crea la tripla var, win, feats - # extract_tuples::AbstractVector = vec(collect(Iterators.product(names(X), ex_windows, ex_measures))), - - # tipo di aggregazione che si vuole alla fine - aggrby::Union{ABT,AbstractVector{<:ABT}} = ( - aggrby = (:var,), - aggregatef = length, # NOTE: or mean, minimum, maximum to aggregate scores instead of just counting number of selected features for each group - group_before_score = true, - ), - - fs_methods::AbstractVector{<:NamedTuple{(:selector, :limiter)}} = [ - ( # STEP 1: unsupervised variance-based filter - selector = SoleFeatures.VarianceFilter(SoleFeatures.IdentityLimiter()), - limiter = PercentageLimiter(0.5), - ), - ( # STEP 2: supervised Mutual Information filter - selector = SoleFeatures.MutualInformationClassif(SoleFeatures.IdentityLimiter()), - limiter = PercentageLimiter(0.1), - ), - ( # STEP 3: group results by variable - selector = IdentityFilter(), - limiter = SoleFeatures.IdentityLimiter(), - ), - ], +# """ +# TODO: documentation - # fix_special_floats::Bool = false, - # fix_special_floats_kwargs::NamedTuple = NamedTuple(), - norm::Bool = false, - normalize_kwargs::NamedTuple = NamedTuple(), +# # Feature Selection with Aggregation Control - cache_extracted_dataset::Union{Nothing,AbstractString} = nothing, - return_mid_results::Union{Val{true},Val{false}} = Val(true), -# )::Union{DataFrame,Tuple{DataFrame,FSMidResults}} where {T<:Number} -) where {T<:Number} +# ## Overview +# The `feature_selection` function allows precise control over how feature aggregation +# is applied during the multi-step feature selection process. - # ==================== PREPARE INPUTS ==================== +# ## Aggregation Parameter (`aggrby`) +# The `aggrby` parameter can be provided in two ways: - if !(aggrby isa AbstractVector) - # when aggrby is not a Vector assume that the user want to perform aggregation - # only during the last step of feature selection TODO: document this properly!!! - aggrby = push!(Union{Nothing,NamedTuple}[fill(nothing, max(length(fs_methods)-1, 0))...], aggrby) - end +# 1. **Single NamedTuple**: When provided as a single NamedTuple (not a vector), +# aggregation is only applied during the final step of feature selection. +# The function automatically creates a vector where: +# - All positions except the last contain `nothing` +# - The last position contains the provided aggregation parameters - # ==================== PREPARE LABELS ==================== +# 2. **Vector of NamedTuples**: When provided as a vector, each element specifies +# the aggregation behavior for the corresponding step in `fs_methods`. +# """ +# function feature_selection( +# X::AbstractMatrix{T}, +# y::Union{Nothing,AbstractVector}, +# Xinfo::Vector{<:SoleFeatures.InfoFeat}; - y_coded = @. CategoricalArrays.levelcode(y) +# # groups_separator::AbstractString = _SEPARATOR, + +# # ex_windows::AbstractVector = [ FixedNumMovingWindows(5, 0.05)... 
], +# # ex_measures::AbstractVector{Union{Function, SuperFeature}} = [minimum, maximum, mean], + +# # cosa vuoi fare al dataset, crea la tripla var, win, feats +# # extract_tuples::AbstractVector = vec(collect(Iterators.product(names(X), ex_windows, ex_measures))), + +# # tipo di aggregazione che si vuole alla fine +# aggrby::Union{ABT,AbstractVector{<:ABT}} = ( +# aggrby = (:var,), +# aggregatef = length, # NOTE: or mean, minimum, maximum to aggregate scores instead of just counting number of selected features for each group +# group_before_score = true, +# ), + +# fs_methods::AbstractVector{<:NamedTuple{(:selector, :limiter)}} = [ +# ( # STEP 1: unsupervised variance-based filter +# selector = SoleFeatures.VarianceFilter(SoleFeatures.IdentityLimiter()), +# limiter = PercentageLimiter(0.5), +# ), +# ( # STEP 2: supervised Mutual Information filter +# selector = SoleFeatures.MutualInformationClassif(SoleFeatures.IdentityLimiter()), +# limiter = PercentageLimiter(0.1), +# ), +# ( # STEP 3: group results by variable +# selector = IdentityFilter(), +# limiter = SoleFeatures.IdentityLimiter(), +# ), +# ], + +# # fix_special_floats::Bool = false, +# # fix_special_floats_kwargs::NamedTuple = NamedTuple(), +# norm::Bool = false, +# normalize_kwargs::NamedTuple = NamedTuple(), + +# cache_extracted_dataset::Union{Nothing,AbstractString} = nothing, +# return_mid_results::Union{Val{true},Val{false}} = Val(true), +# # )::Union{DataFrame,Tuple{DataFrame,FSMidResults}} where {T<:Number} +# ) where {T<:Number} - # ================== DATASET EXTRACTION ================== +# # ==================== PREPARE INPUTS ==================== - # QUI inizia feature selection - # extract new dataset - # newX = begin - # local ced - # local _extr - # ced = cache_extracted_dataset - # _extr = extract - # # TODO: this Float64 is a strong assumption! - # Float64.(@scache_if !isnothing(ced) "dse" ced _extr(X, extract_tuples)) - # end +# if !(aggrby isa AbstractVector) +# # when aggrby is not a Vector assume that the user want to perform aggregation +# # only during the last step of feature selection TODO: document this properly!!! +# aggrby = push!(Union{Nothing,NamedTuple}[fill(nothing, max(length(fs_methods)-1, 0))...], aggrby) +# end - # # groups_separator = "@@@" - # if groups_separator != _SEPARATOR - # rename!(x -> replace(x, _SEPARATOR => groups_separator), newX) - # end - # extraction_column_names = names(newX) +# # ==================== PREPARE LABELS ==================== +# y_coded = @. CategoricalArrays.levelcode(y) - # =================== SPECIAL FLOAT FIX =================== +# # ================== DATASET EXTRACTION ================== - # if fix_special_floats - # @warn "DANGER!!! It is really discouraged to call this function " * - # "`fix_special_floats` set to `true`" - # fix_special_floats_kwargs = merge(fix_special_floats_kwargs, (remove_too_nan_instance = false,)) - # _fix_nan_inf_dataset!(newX, y; fix_special_floats_kwargs...) - # # FIXME: this function could alter the length o `y` and create - # # heavy inconsistencies!!! (this is why I forced - # # `remove_too_nan_instance` to false) - # end +# # QUI inizia feature selection +# # extract new dataset +# # newX = begin +# # local ced +# # local _extr +# # ced = cache_extracted_dataset +# # _extr = extract +# # # TODO: this Float64 is a strong assumption! 
+# # Float64.(@scache_if !isnothing(ced) "dse" ced _extr(X, extract_tuples)) +# # end - # ================== DATASET NORMALIZATION ================== +# # # groups_separator = "@@@" +# # if groups_separator != _SEPARATOR +# # rename!(x -> replace(x, _SEPARATOR => groups_separator), newX) +# # end +# # extraction_column_names = names(newX) - norm && _normalize_dataset!(X, Xinfo; normalize_kwargs...) - # =================== NO FEATURE SELECTION ================== +# # =================== SPECIAL FLOAT FIX =================== - # # if no feature selector was passed we can assume the user just wanted to extract features from dataset - # if length(fs_methods) == 0 - # if isa(return_mid_results, Val{true}) - # return X, NamedTuple() - # else - # return X - # end - # end +# # if fix_special_floats +# # @warn "DANGER!!! It is really discouraged to call this function " * +# # "`fix_special_floats` set to `true`" +# # fix_special_floats_kwargs = merge(fix_special_floats_kwargs, (remove_too_nan_instance = false,)) +# # _fix_nan_inf_dataset!(newX, y; fix_special_floats_kwargs...) +# # # FIXME: this function could alter the length o `y` and create +# # # heavy inconsistencies!!! (this is why I forced +# # # `remove_too_nan_instance` to false) +# # end - # ===================== FEATURE SELECTION =================== +# # ================== DATASET NORMALIZATION ================== - # questo serve solo per generare grafici - # fs_mid_results = NamedTuple{(:score,:indices,:name2score,:group_aggr_func,:group_indices,:aggrby)}[] - fs_mid_results = NamedTuple{(:score, :indices,:group_aggr_func,:group_indices,:aggrby)}[] +# norm && _normalize_dataset!(X, Xinfo; normalize_kwargs...) - for (fsm, gfs_params) in zip(fs_methods, aggrby) - current_dataset_col_slice = 1:size(X, 2) +# # =================== NO FEATURE SELECTION ================== - # pick survived columns only - for i in 1:length(fs_mid_results) - current_dataset_col_slice = current_dataset_col_slice[fs_mid_results[i].indices] - end +# # # if no feature selector was passed we can assume the user just wanted to extract features from dataset +# # if length(fs_methods) == 0 +# # if isa(return_mid_results, Val{true}) +# # return X, NamedTuple() +# # else +# # return X +# # end +# # end - currX = X[:,current_dataset_col_slice] - currXinfo = Xinfo[current_dataset_col_slice] - - dataset_param = isnothing(y_coded) || SoleFeatures.is_unsupervised(fsm.selector) ? 
- (currX, currXinfo) : - (currX, y_coded, currXinfo) - - idxes, score, g_indices = - if isnothing(gfs_params) - # perform normal feature selection - SoleFeatures._fs(dataset_param..., fsm...)..., nothing - else - # perform aggregated feature selection - sel_g_indices, g_scores, g_indices, grouped_variable_scores = SoleFeatures._fsgroup( - dataset_param..., fsm..., gfs_params.aggrby; - aggregatef = gfs_params.aggregatef, - group_before_score = gfs_params.group_before_score - ) +# # ===================== FEATURE SELECTION =================== - # find indices to re-sort the scores of all variables to their - # original position in dataset columns - old_sort = sortperm(vcat(g_indices...)) +# # questo serve solo per generare grafici +# # fs_mid_results = NamedTuple{(:score,:indices,:name2score,:group_aggr_func,:group_indices,:aggrby)}[] +# fs_mid_results = NamedTuple{(:score, :indices,:group_aggr_func,:group_indices,:aggrby)}[] - vcat(g_indices[sel_g_indices]...), vcat(vcat(grouped_variable_scores...)[old_sort]...), g_indices - end +# for (fsm, gfs_params) in zip(fs_methods, aggrby) +# current_dataset_col_slice = 1:size(X, 2) - sort!(idxes) +# # pick survived columns only +# for i in 1:length(fs_mid_results) +# current_dataset_col_slice = current_dataset_col_slice[fs_mid_results[i].indices] +# end - push!(fs_mid_results, ( - score = score, - indices = idxes, - group_aggr_func = isnothing(gfs_params) ? nothing : gfs_params.aggregatef, - group_indices = g_indices, - aggrby = isnothing(gfs_params) ? nothing : gfs_params.aggrby - )) - end +# currX = X[:,current_dataset_col_slice] +# currXinfo = Xinfo[current_dataset_col_slice] + +# dataset_param = isnothing(y_coded) || SoleFeatures.is_unsupervised(fsm.selector) ? +# (currX, currXinfo) : +# (currX, y_coded, currXinfo) + +# idxes, score, g_indices = +# if isnothing(gfs_params) +# # perform normal feature selection +# SoleFeatures._fs(dataset_param..., fsm...)..., nothing +# else +# # perform aggregated feature selection +# sel_g_indices, g_scores, g_indices, grouped_variable_scores = SoleFeatures._fsgroup( +# dataset_param..., fsm..., gfs_params.aggrby; +# aggregatef = gfs_params.aggregatef, +# group_before_score = gfs_params.group_before_score +# ) + +# # find indices to re-sort the scores of all variables to their +# # original position in dataset columns +# old_sort = sortperm(vcat(g_indices...)) + +# vcat(g_indices[sel_g_indices]...), vcat(vcat(grouped_variable_scores...)[old_sort]...), g_indices +# end + +# sort!(idxes) + +# push!(fs_mid_results, ( +# score = score, +# indices = idxes, +# group_aggr_func = isnothing(gfs_params) ? nothing : gfs_params.aggregatef, +# group_indices = g_indices, +# aggrby = isnothing(gfs_params) ? nothing : gfs_params.aggrby +# )) +# end - dataset_col_slice = 1:size(X, 2) - for i in 1:length(fs_mid_results) - dataset_col_slice = dataset_col_slice[fs_mid_results[i].indices] - end +# dataset_col_slice = 1:size(X, 2) +# for i in 1:length(fs_mid_results) +# dataset_col_slice = dataset_col_slice[fs_mid_results[i].indices] +# end - if isa(return_mid_results, Val{true}) - return X, X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results) - else - return X[:,dataset_col_slice] - end -end -feature_selection(X::AbstractDataFrame, args...; kwargs...) = feature_selection(Matrix(X), args...; kwargs...) 
+# if isa(return_mid_results, Val{true}) +# return X, X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results) +# else +# return X[:,dataset_col_slice] +# end +# end +# feature_selection(X::AbstractDataFrame, args...; kwargs...) = feature_selection(Matrix(X), args...; kwargs...) """ TODO: docs @@ -919,6 +919,7 @@ end # ---------------------------------------------------------------------------- # # debug # # ---------------------------------------------------------------------------- # +using SoleData, SoleFeatures # load a time-series dataset df, y = SoleData.load_arff_dataset("NATOPS") @@ -938,14 +939,15 @@ fs_methods = [ ), ] -# prepare dataset for feature selection -Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=adaptivewindow, nwindows=6, relative_overlap=0.05) -# Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=wholewindow) +# # prepare dataset for feature selection +# Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=adaptivewindow, nwindows=6, relative_overlap=0.05) +# # Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=wholewindow) -@info "FEATURE SELECTION" +# @info "FEATURE SELECTION" -using BenchmarkTools +# using BenchmarkTools -a = feature_selection(Xdf, y, Xinfo, fs_methods = fs_methods, norm = false) +Xdf, Xinfo = SoleFeatures.feature_selection_preprocess(df; features=ms, type=adaptivewindow, nwindows=6, relative_overlap=0.05) +a = SoleFeatures.feature_selection(Xdf, y, Xinfo; fs_methods = fs_methods, norm = true) # 3.189 ms (52904 allocations: 5.54 MiB)
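
Usage note (editor's sketch, not part of the diff above): the debug section already builds `Xdf`, `y`, `Xinfo` and `fs_methods` from the NATOPS dataset. The snippet below shows how the new `aggrby` keyword can be passed as a single NamedTuple, so that grouped aggregation is applied only at the last step of `fs_methods`; the names `Xall`, `Xsel` and `mid` are illustrative only. With `return_mid_results = Val(true)` (the default), `feature_selection` returns the full matrix, the matrix restricted to the selected columns, and the intermediate results collected in `fs_mid_results`.

# hypothetical usage, assuming Xdf, y, Xinfo and fs_methods from the debug section above
Xall, Xsel, mid = SoleFeatures.feature_selection(
    Xdf, y, Xinfo;
    fs_methods = fs_methods,
    # aggregate only at the final step, grouping the extracted columns by variable
    aggrby = (aggrby = (:var,), aggregatef = length, group_before_score = true),
    norm = true,
)

mid.fs_mid_results[end].group_indices   # column groups built for the final step
length(mid.extraction_column_names)     # number of surviving columns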