Skip to content

Commit

Permalink
lorenzo
Browse files Browse the repository at this point in the history
  • Loading branch information
PasoStudio73 committed Mar 4, 2025
1 parent f387acd commit 256a803
Show file tree
Hide file tree
Showing 14 changed files with 1,207 additions and 77 deletions.
6 changes: 6 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
MLBase = "f0e99cf1-93fa-52ec-9ecc-5026115318e0"
MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Expand All @@ -23,9 +26,12 @@ DataFrames = "1"
HypothesisTests = "0.11"
IterTools = "1"
MultiData = "0.1.4"
NearestNeighbors = "0.4.21"
OrderedCollections = "1"
Random = "1"
SoleBase = "0.13"
SparseArrays = "1.11.0"
SpecialFunctions = "2.5.0"
StatsBase = "0.30 - 0.34"
julia = "1"

Expand Down
32 changes: 19 additions & 13 deletions src/SoleFeatures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,43 @@ module SoleFeatures
using SoleBase
using MultiData
using StatsBase, Catch22
using CategoricalArrays, DataFrames

using SpecialFunctions # For digamma function
using NearestNeighbors # For KDTree and knn
using SparseArrays, CategoricalArrays, DataFrames
using Random

using Base.Threads: @threads


include("interface.jl")
export AbstractFilterBased
include("utils/utils.jl")

include("utils/features_set.jl")
export mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos
export outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing
export stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity
export base_set, catch9, catch22_set, complete_set

include("dataset/interface.jl")
export Feature, Score, GroupScore
include("dataset/prepare_dataset.jl")
export feature_selection_preprocess

# filters
include("filters/limiter.jl")
export AbstractLimiter
export PercentageLimiter
include("filters/interface.jl")

include("filters/univariate/identityfilter.jl")
include("filters/mutual_info.jl")
include("filters/univariate/mutualinformationclassif.jl")
export MutualInformationClassifRanking
include("filters/univariate/variancefilter.jl")
export VarianceRanking, VarianceThreshold

include("utils/features_set.jl")
export mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos
export outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing
export stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity
export base_set, catch9, catch22_set, complete_set

include("dataset/interface.jl")
export Feature

include("dataset/prepare_dataset.jl")
export feature_selection_preprocess

# using SoleData
# using Reexport
# using LinearAlgebra
Expand Down
89 changes: 72 additions & 17 deletions src/dataset/interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ Abstract type for feature struct
"""
abstract type AbstractFeature end

"""
Abstract type for score struct
"""
abstract type AbstractScore end

# ---------------------------------------------------------------------------- #
# types #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -256,44 +261,94 @@ const WIN_PARAMS = Dict(
# end

"""
    InfoFeat{T<:VarName} <: AbstractFeature

Holds information about dataset columns, used in feature selection.

# Type Parameters
- `T`: VarName type (must be either `Symbol` or `String`)

# Fields
- `id   :: Int`    : Unique identifier for the feature
- `var  :: T`      : The variable name/identifier
- `feat :: Symbol` : The feature extraction function name
- `nwin :: Int`    : The window number (must be positive)

# Constructors
```julia
InfoFeat(id::Int, var::Union{Symbol,String}, feat::Symbol, nwin::Integer)
```
"""
struct InfoFeat{T<:VarName} <: AbstractFeature
    id   :: Int
    var  :: T
    feat :: Symbol
    nwin :: Int

    # Inner constructor validates the window index before building the value.
    function InfoFeat(id::Int, var::VarName, feat::Symbol, nwin::Int)
        nwin > 0 || throw(ArgumentError("Window number must be positive"))
        new{typeof(var)}(id, var, feat, nwin)
    end
end

# Property access: plain field access; advertise the public property set.
Base.getproperty(f::InfoFeat, s::Symbol) = getfield(f, s)
Base.propertynames(::InfoFeat) = (:id, :feat, :var, :nwin)

# Accessors for InfoFeat fields.
feature_id(f::InfoFeat) = f.id        # unique identifier of the feature
variable_name(f::InfoFeat) = f.var    # variable name/identifier
feature_type(f::InfoFeat) = f.feat    # feature-extraction function name
window_number(f::InfoFeat) = f.nwin   # window index (1-based, positive)

"""
    Score <: AbstractScore

A struct representing the score of an individual feature in feature selection.

# Fields
- `id :: Int` : The unique identifier of the feature that this score belongs to
- `score :: Float64` : The numerical score value indicating feature importance/relevance

# Constructors
```julia
Score(id::Int, score::Float64)
```
"""
struct Score <: AbstractScore
    id    :: Int
    score :: Float64
end

# Property access: plain field access; advertise the public property set.
Base.getproperty(sc::Score, s::Symbol) = getfield(sc, s)
Base.propertynames(::Score) = (:id, :score)

score_id(s::Score) = s.id        # identifier of the scored feature
score_val(s::Score) = s.score    # numerical score value

"""
    GroupScore <: AbstractScore

A struct representing the score of a group of features in grouped feature selection.

# Fields
- `grp :: Tuple{Vararg{Symbol}}` : Tuple of symbols identifying the feature group
- `score :: Float64` : Numerical score value indicating the group's importance/relevance

# Constructors
```julia
GroupScore(grp::Tuple{Vararg{Symbol}}, score::Float64)
```
"""
struct GroupScore <: AbstractScore
    grp   :: Tuple{Vararg{Symbol}}
    score :: Float64
end

# Property access: plain field access; advertise the public property set.
Base.getproperty(sc::GroupScore, s::Symbol) = getfield(sc, s)
# BUG FIX: previously advertised (:id, :score), but GroupScore has no `id` field.
Base.propertynames(::GroupScore) = (:grp, :score)

# BUG FIX: `s.id` would throw for GroupScore (getproperty is plain getfield and
# there is no `id` field); the group identifier is `grp`.
score_id(s::GroupScore) = s.grp      # tuple identifying the feature group
score_val(s::GroupScore) = s.score   # numerical score value

# ---------------------------------------------------------------------------- #
# functions definitions #
# ---------------------------------------------------------------------------- #
41 changes: 27 additions & 14 deletions src/dataset/prepare_dataset.jl
Original file line number Diff line number Diff line change
"""
    feature_selection_preprocess(X::DataFrame; vnames, features, type, nwindows, relative_overlap)

Prepare a dataset for feature selection: validate the windowing keywords,
assemble the window parameters, and return the treated dataset together with
one `InfoFeat` descriptor per (feature, variable, window) combination.

# Keywords
- `vnames`: variable names to use; defaults to `names(X)`
- `features`: feature-extraction functions; defaults to `DEFAULT_FE.features`
- `type`: window type, must be a key of `WIN_PARAMS` when given
- `nwindows`: number of windows, must be positive when given
- `relative_overlap`: window overlap fraction, must be non-negative when given

# Returns
A tuple `(treated_dataset, Xinfo)` where `Xinfo :: Vector{InfoFeat}`.

# Throws
`ArgumentError` on an invalid `type`, non-positive `nwindows`,
or negative `relative_overlap`.
"""
function feature_selection_preprocess(
    X::DataFrame;
    vnames::VarNames=nothing,
    features::FeatNames=nothing,
    type::Union{Base.Callable, Nothing}=nothing,
    nwindows::Union{Int, Nothing}=nothing,
    relative_overlap::Union{AbstractFloat, Nothing}=nothing
)
    # fill in defaults and run the dimensionality sanity check
    isnothing(vnames) && (vnames = names(X))
    isnothing(features) && (features = DEFAULT_FE.features)
    treatment = :aggregate
    _ = _check_dimensions(X)

    # validate keyword arguments (membership/comparison operators restored:
    # they were lost in the original text)
    if !isnothing(type)
        type in keys(WIN_PARAMS) || throw(ArgumentError("Invalid window type."))
    end
    if !isnothing(nwindows)
        nwindows > 0 || throw(ArgumentError("Number of windows must be positive."))
    end
    if !isnothing(relative_overlap)
        relative_overlap >= 0 || throw(ArgumentError("Overlap must be non-negative."))
    end

    # build window parameters, layering each provided keyword onto the defaults
    winparams = begin
        base_params = isnothing(type) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (type = type,))
        # BUG FIX: the previous code restarted from DEFAULT_FE_WINPARAMS here,
        # silently discarding the `type` merge above.
        base_params = isnothing(nwindows) ? base_params : merge(base_params, (nwindows = nwindows,))
        isnothing(relative_overlap) ? base_params : merge(base_params, (relative_overlap = relative_overlap,))
    end

    # effective window count: the keyword may be `nothing`, in which case fall
    # back to the defaulted winparams.
    # NOTE(review): assumes DEFAULT_FE_WINPARAMS always carries `nwindows` — confirm.
    nwin = isnothing(nwindows) ? winparams.nwindows : nwindows

    # one InfoFeat per (feature, variable, window) combination, ids are 1-based
    total_features = length(features) * length(vnames) * nwin
    Xinfo = Vector{InfoFeat}(undef, total_features)
    idx = 1
    for f in features, v in vnames, n in 1:nwin
        Xinfo[idx] = InfoFeat(idx, v, Symbol(f), n)
        idx += 1
    end

    return _treatment(X, vnames, treatment, features, winparams), Xinfo
end
20 changes: 20 additions & 0 deletions src/filters/limiter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ function limit(scores::AbstractVector{<:Real}, tl::ThresholdLimiter)
return findall(ordf(tl)(threshold(tl)), scores)
end

# Score-object overload: extract the numeric scores and delegate to the
# numeric method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path
# (both score types carry a `score::Float64` field).
function limit(scores::AbstractVector{<:AbstractScore}, tl::ThresholdLimiter)
    return limit([s.score for s in scores], tl)
end

# ---------------------------------------------------------------------------- #
# ranking limiter #
# ---------------------------------------------------------------------------- #
Expand All @@ -74,6 +78,10 @@ function limit(scores::AbstractVector{<:Real}, rl::RankingLimiter)
return sortperm(scores; rev=rev(rl))[1:nbest(rl)]
end

# Score-object overload: extract the numeric scores and delegate to the
# numeric method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path.
function limit(scores::AbstractVector{<:AbstractScore}, rl::RankingLimiter)
    return limit([s.score for s in scores], rl)
end

# ---------------------------------------------------------------------------- #
# majority limiter #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -109,6 +117,10 @@ function limit(scores::AbstractVector, ml::MajorityLimiter)
return findall(accepted .≥ bounds)
end

# Score-object overload: extract the numeric scores and delegate to the
# generic method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector (still more specific than the plain AbstractVector
# method above, so dispatch is unambiguous).
function limit(scores::AbstractVector{<:AbstractScore}, ml::MajorityLimiter)
    return limit([s.score for s in scores], ml)
end

# ---------------------------------------------------------------------------- #
# atleast limiter #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -142,6 +154,10 @@ function limit(scores::AbstractVector, al::AtLeastLimiter)
return findall(accepted .≥ al.atleast)
end

# Score-object overload: extract the numeric scores and delegate to the
# generic method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path.
function limit(scores::AbstractVector{<:AbstractScore}, al::AtLeastLimiter)
    return limit([s.score for s in scores], al)
end

# ---------------------------------------------------------------------------- #
# percentange limiter #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -180,3 +196,7 @@ function limit(scores::AbstractVector{<:Real}, l::PercentageLimiter)
len = Int(ceil(length(scores) * perc(l)))
return sortperm(scores; rev = rev(l))[1:len]
end

# Score-object overload: extract the numeric scores and delegate to the
# numeric method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path.
function limit(scores::AbstractVector{<:AbstractScore}, l::PercentageLimiter)
    return limit([s.score for s in scores], l)
end
12 changes: 2 additions & 10 deletions src/filters/mutual_info.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
# packages
using StatsBase
using SparseArrays, CategoricalArrays
using SpecialFunctions # For digamma function
using NearestNeighbors # For KDTree and knn
using Base.Threads: @threads
using Random

# ---------------------------------------------------------------------------- #
# utils #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -257,7 +249,7 @@ References
Data Sets". PLoS ONE 9(2), 2014.
"""
function _estimate_mi(
Xdf::AbstractDataFrame,
Xdf::AbstractMatrix,
y::AbstractVector{<:SoleFeatures.Class};
discrete_mode::Union{AbstractArray, Nothing}=nothing,
discrete_target::Bool=false,
Expand Down Expand Up @@ -302,7 +294,7 @@ function _estimate_mi(
mi = Vector{Float64}(undef, n_features)

# Use threading for parallel computation
@threads for i in 1:n_features
Threads.@threads for i in 1:n_features
x = X[:, i]
mi[i] = _compute_mi(x, y, discrete_mask[i], discrete_target, n_neighbors)
end
Expand Down
11 changes: 10 additions & 1 deletion src/filters/univariate/mutualinformationclassif.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@ function score(
y::AbstractVector{<:Class},
selector::MutualInformationClassif
)::Vector{Float64}
scores = fs.mutual_info_classif(Matrix(X), y)
scores = mutual_info_classif(Matrix(X), y)
return scores
end

# Matrix overload of `score` for mutual-information-based selection.
# Materializes a dense `Matrix` so the downstream computation works on a
# concrete array, then delegates the actual MI estimation.
function score(
    X::AbstractMatrix,
    y::AbstractVector{<:Class},
    selector::MutualInformationClassif
)::Vector{Float64}
    return mutual_info_classif(Matrix(X), y)
end

Expand Down
7 changes: 7 additions & 0 deletions src/filters/univariate/variancefilter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ function score(
return var.(eachcol(X))
end

# Matrix overload of `score` for variance-based selection: one sample
# variance per feature column.
function score(
    X::AbstractMatrix,
    selector::VarianceFilter
)
    return map(var, eachcol(X))
end

# Ranking
VarianceRanking(nbest) = VarianceFilter(RankingLimiter(nbest, true))
# Threshold
Expand Down
Loading

0 comments on commit 256a803

Please sign in to comment.