Skip to content

Commit

Permalink
lorenzo
Browse files Browse the repository at this point in the history
  • Loading branch information
PasoStudio73 committed Mar 4, 2025
1 parent f387acd commit 256a803
Show file tree
Hide file tree
Showing 14 changed files with 1,207 additions and 77 deletions.
6 changes: 6 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
MLBase = "f0e99cf1-93fa-52ec-9ecc-5026115318e0"
MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Expand All @@ -23,9 +26,12 @@ DataFrames = "1"
HypothesisTests = "0.11"
IterTools = "1"
MultiData = "0.1.4"
NearestNeighbors = "0.4.21"
OrderedCollections = "1"
Random = "1"
SoleBase = "0.13"
SparseArrays = "1.11.0"
SpecialFunctions = "2.5.0"
StatsBase = "0.30 - 0.34"
julia = "1"

Expand Down
32 changes: 19 additions & 13 deletions src/SoleFeatures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,43 @@ module SoleFeatures
using SoleBase
using MultiData
using StatsBase, Catch22
using CategoricalArrays, DataFrames

using SpecialFunctions # For digamma function
using NearestNeighbors # For KDTree and knn
using SparseArrays, CategoricalArrays, DataFrames
using Random

using Base.Threads: @threads


include("interface.jl")
export AbstractFilterBased
include("utils/utils.jl")

include("utils/features_set.jl")
export mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos
export outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing
export stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity
export base_set, catch9, catch22_set, complete_set

include("dataset/interface.jl")
export Feature, Score, GroupScore
include("dataset/prepare_dataset.jl")
export feature_selection_preprocess

# filters
include("filters/limiter.jl")
export AbstractLimiter
export PercentageLimiter
include("filters/interface.jl")

include("filters/univariate/identityfilter.jl")
include("filters/mutual_info.jl")
include("filters/univariate/mutualinformationclassif.jl")
export MutualInformationClassifRanking
include("filters/univariate/variancefilter.jl")
export VarianceRanking, VarianceThreshold

include("utils/features_set.jl")
export mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos
export outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing
export stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity
export base_set, catch9, catch22_set, complete_set

include("dataset/interface.jl")
export Feature

include("dataset/prepare_dataset.jl")
export feature_selection_preprocess

# using SoleData
# using Reexport
# using LinearAlgebra
Expand Down
89 changes: 72 additions & 17 deletions src/dataset/interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ Abstract type for feature struct
"""
abstract type AbstractFeature end

"""
Abstract type for score struct
"""
abstract type AbstractScore end

# ---------------------------------------------------------------------------- #
# types #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -256,44 +261,94 @@ const WIN_PARAMS = Dict(
# end

"""
    InfoFeat{T<:VarName} <: AbstractFeature

Holds information about dataset columns, used in feature selection.

# Type Parameters
- `T`: VarName type (must be either `Symbol` or `String`)

# Fields
- `id   :: Int`    : Unique identifier for the feature
- `var  :: T`      : The variable name/identifier
- `feat :: Symbol` : The feature extraction function name
- `nwin :: Int`    : The window number (must be positive)

# Constructors
```julia
InfoFeat(id::Int, var::Union{Symbol,String}, feat::Symbol, nwin::Integer)
```
"""
struct InfoFeat{T<:VarName} <: AbstractFeature
    id   :: Int
    var  :: T
    feat :: Symbol
    nwin :: Int

    # Inner constructor validates the window index before building the value.
    function InfoFeat(id::Int, var::VarName, feat::Symbol, nwin::Int)
        nwin > 0 || throw(ArgumentError("Window number must be positive"))
        new{typeof(var)}(id, var, feat, nwin)
    end
end

# Property access: plain field access; advertise the public property set.
Base.getproperty(f::InfoFeat, s::Symbol) = getfield(f, s)
Base.propertynames(::InfoFeat) = (:id, :feat, :var, :nwin)

# Accessors for InfoFeat fields.
feature_id(f::InfoFeat) = f.id        # unique identifier of the feature
variable_name(f::InfoFeat) = f.var    # variable name/identifier
feature_type(f::InfoFeat) = f.feat    # feature-extraction function name
window_number(f::InfoFeat) = f.nwin   # window index (1-based, positive)

"""
    Score <: AbstractScore

A struct representing the score of an individual feature in feature selection.

# Fields
- `id :: Int` : The unique identifier of the feature that this score belongs to
- `score :: Float64` : The numerical score value indicating feature importance/relevance

# Constructors
```julia
Score(id::Int, score::Float64)
```
"""
struct Score <: AbstractScore
    id    :: Int
    score :: Float64
end

# Property access: plain field access; advertise the public property set.
Base.getproperty(sc::Score, s::Symbol) = getfield(sc, s)
Base.propertynames(::Score) = (:id, :score)

score_id(s::Score) = s.id        # identifier of the scored feature
score_val(s::Score) = s.score    # numerical score value

"""
    GroupScore <: AbstractScore

A struct representing the score of a group of features in grouped feature selection.

# Fields
- `grp :: Tuple{Vararg{Symbol}}` : Tuple of symbols identifying the feature group
- `score :: Float64` : Numerical score value indicating the group's importance/relevance

# Constructors
```julia
GroupScore(grp::Tuple{Vararg{Symbol}}, score::Float64)
```
"""
struct GroupScore <: AbstractScore
    grp   :: Tuple{Vararg{Symbol}}
    score :: Float64
end

# Property access: plain field access; advertise the public property set.
Base.getproperty(sc::GroupScore, s::Symbol) = getfield(sc, s)
# BUG FIX: previously advertised (:id, :score), but GroupScore has no `id` field.
Base.propertynames(::GroupScore) = (:grp, :score)

# BUG FIX: `s.id` would throw for GroupScore (getproperty is plain getfield and
# there is no `id` field); the group identifier is `grp`.
score_id(s::GroupScore) = s.grp      # tuple identifying the feature group
score_val(s::GroupScore) = s.score   # numerical score value

# ---------------------------------------------------------------------------- #
# functions definitions #
# ---------------------------------------------------------------------------- #
41 changes: 27 additions & 14 deletions src/dataset/prepare_dataset.jl
Original file line number Diff line number Diff line change
"""
    feature_selection_preprocess(X::DataFrame; vnames, features, type, nwindows, relative_overlap)

Prepare a dataset for feature selection: validate the windowing keywords,
assemble the window parameters, and return the treated dataset together with
one `InfoFeat` descriptor per (feature, variable, window) combination.

# Keywords
- `vnames`: variable names to use; defaults to `names(X)`
- `features`: feature-extraction functions; defaults to `DEFAULT_FE.features`
- `type`: window type, must be a key of `WIN_PARAMS` when given
- `nwindows`: number of windows, must be positive when given
- `relative_overlap`: window overlap fraction, must be non-negative when given

# Returns
A tuple `(treated_dataset, Xinfo)` where `Xinfo :: Vector{InfoFeat}`.

# Throws
`ArgumentError` on an invalid `type`, non-positive `nwindows`,
or negative `relative_overlap`.
"""
function feature_selection_preprocess(
    X::DataFrame;
    vnames::VarNames=nothing,
    features::FeatNames=nothing,
    type::Union{Base.Callable, Nothing}=nothing,
    nwindows::Union{Int, Nothing}=nothing,
    relative_overlap::Union{AbstractFloat, Nothing}=nothing
)
    # fill in defaults and run the dimensionality sanity check
    isnothing(vnames) && (vnames = names(X))
    isnothing(features) && (features = DEFAULT_FE.features)
    treatment = :aggregate
    _ = _check_dimensions(X)

    # validate keyword arguments (membership/comparison operators restored:
    # they were lost in the original text)
    if !isnothing(type)
        type in keys(WIN_PARAMS) || throw(ArgumentError("Invalid window type."))
    end
    if !isnothing(nwindows)
        nwindows > 0 || throw(ArgumentError("Number of windows must be positive."))
    end
    if !isnothing(relative_overlap)
        relative_overlap >= 0 || throw(ArgumentError("Overlap must be non-negative."))
    end

    # build window parameters, layering each provided keyword onto the defaults
    winparams = begin
        base_params = isnothing(type) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (type = type,))
        # BUG FIX: the previous code restarted from DEFAULT_FE_WINPARAMS here,
        # silently discarding the `type` merge above.
        base_params = isnothing(nwindows) ? base_params : merge(base_params, (nwindows = nwindows,))
        isnothing(relative_overlap) ? base_params : merge(base_params, (relative_overlap = relative_overlap,))
    end

    # effective window count: the keyword may be `nothing`, in which case fall
    # back to the defaulted winparams.
    # NOTE(review): assumes DEFAULT_FE_WINPARAMS always carries `nwindows` — confirm.
    nwin = isnothing(nwindows) ? winparams.nwindows : nwindows

    # one InfoFeat per (feature, variable, window) combination, ids are 1-based
    total_features = length(features) * length(vnames) * nwin
    Xinfo = Vector{InfoFeat}(undef, total_features)
    idx = 1
    for f in features, v in vnames, n in 1:nwin
        Xinfo[idx] = InfoFeat(idx, v, Symbol(f), n)
        idx += 1
    end

    return _treatment(X, vnames, treatment, features, winparams), Xinfo
end
20 changes: 20 additions & 0 deletions src/filters/limiter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ function limit(scores::AbstractVector{<:Real}, tl::ThresholdLimiter)
return findall(ordf(tl)(threshold(tl)), scores)
end

# Score-object overload: extract the numeric scores and delegate to the
# numeric method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path
# (both score types carry a `score::Float64` field).
function limit(scores::AbstractVector{<:AbstractScore}, tl::ThresholdLimiter)
    return limit([s.score for s in scores], tl)
end

# ---------------------------------------------------------------------------- #
# ranking limiter #
# ---------------------------------------------------------------------------- #
Expand All @@ -74,6 +78,10 @@ function limit(scores::AbstractVector{<:Real}, rl::RankingLimiter)
return sortperm(scores; rev=rev(rl))[1:nbest(rl)]
end

# Score-object overload: extract the numeric scores and delegate to the
# numeric method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path.
function limit(scores::AbstractVector{<:AbstractScore}, rl::RankingLimiter)
    return limit([s.score for s in scores], rl)
end

# ---------------------------------------------------------------------------- #
# majority limiter #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -109,6 +117,10 @@ function limit(scores::AbstractVector, ml::MajorityLimiter)
return findall(accepted .≥ bounds)
end

# Score-object overload: extract the numeric scores and delegate to the
# generic method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector (still more specific than the plain AbstractVector
# method above, so dispatch is unambiguous).
function limit(scores::AbstractVector{<:AbstractScore}, ml::MajorityLimiter)
    return limit([s.score for s in scores], ml)
end

# ---------------------------------------------------------------------------- #
# atleast limiter #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -142,6 +154,10 @@ function limit(scores::AbstractVector, al::AtLeastLimiter)
return findall(accepted .≥ al.atleast)
end

# Score-object overload: extract the numeric scores and delegate to the
# generic method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path.
function limit(scores::AbstractVector{<:AbstractScore}, al::AtLeastLimiter)
    return limit([s.score for s in scores], al)
end

# ---------------------------------------------------------------------------- #
# percentange limiter #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -180,3 +196,7 @@ function limit(scores::AbstractVector{<:Real}, l::PercentageLimiter)
len = Int(ceil(length(scores) * perc(l)))
return sortperm(scores; rev = rev(l))[1:len]
end

# Score-object overload: extract the numeric scores and delegate to the
# numeric method. Generalized from AbstractVector{GroupScore} to any
# AbstractScore vector so Vector{Score} is handled by the same path.
function limit(scores::AbstractVector{<:AbstractScore}, l::PercentageLimiter)
    return limit([s.score for s in scores], l)
end
12 changes: 2 additions & 10 deletions src/filters/mutual_info.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
# packages
using StatsBase
using SparseArrays, CategoricalArrays
using SpecialFunctions # For digamma function
using NearestNeighbors # For KDTree and knn
using Base.Threads: @threads
using Random

# ---------------------------------------------------------------------------- #
# utils #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -257,7 +249,7 @@ References
Data Sets". PLoS ONE 9(2), 2014.
"""
function _estimate_mi(
Xdf::AbstractDataFrame,
Xdf::AbstractMatrix,
y::AbstractVector{<:SoleFeatures.Class};
discrete_mode::Union{AbstractArray, Nothing}=nothing,
discrete_target::Bool=false,
Expand Down Expand Up @@ -302,7 +294,7 @@ function _estimate_mi(
mi = Vector{Float64}(undef, n_features)

# Use threading for parallel computation
@threads for i in 1:n_features
Threads.@threads for i in 1:n_features
x = X[:, i]
mi[i] = _compute_mi(x, y, discrete_mask[i], discrete_target, n_neighbors)
end
Expand Down
11 changes: 10 additions & 1 deletion src/filters/univariate/mutualinformationclassif.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@ function score(
y::AbstractVector{<:Class},
selector::MutualInformationClassif
)::Vector{Float64}
scores = fs.mutual_info_classif(Matrix(X), y)
scores = mutual_info_classif(Matrix(X), y)
return scores
end

# Matrix overload of `score` for mutual-information-based selection.
# Materializes a dense `Matrix` so the downstream computation works on a
# concrete array, then delegates the actual MI estimation.
function score(
    X::AbstractMatrix,
    y::AbstractVector{<:Class},
    selector::MutualInformationClassif
)::Vector{Float64}
    return mutual_info_classif(Matrix(X), y)
end

Expand Down
7 changes: 7 additions & 0 deletions src/filters/univariate/variancefilter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ function score(
return var.(eachcol(X))
end

# Matrix overload of `score` for variance-based selection: one sample
# variance per feature column.
function score(
    X::AbstractMatrix,
    selector::VarianceFilter
)
    return map(var, eachcol(X))
end

# Ranking
VarianceRanking(nbest) = VarianceFilter(RankingLimiter(nbest, true))
# Threshold
Expand Down
Loading

0 comments on commit 256a803

Please sign in to comment.