PasoStudio73 committed Mar 4, 2025
1 parent 6838d29 commit 18b1602
Showing 8 changed files with 119 additions and 90 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
Expand All @@ -29,6 +30,7 @@ MultiData = "0.1.4"
NearestNeighbors = "0.4.21"
OrderedCollections = "1"
Random = "1"
Reexport = "1.2.2"
SoleBase = "0.13"
SparseArrays = "1.11.0"
SpecialFunctions = "2.5.0"
Expand Down
3 changes: 3 additions & 0 deletions src/SoleFeatures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ using SoleBase
using MultiData
using StatsBase, Catch22

using Reexport
@reexport using SoleBase: movingwindow, wholewindow, splitwindow, adaptivewindow

using SpecialFunctions # For digamma function
using NearestNeighbors # For KDTree and knn
using SparseArrays, CategoricalArrays, DataFrames
Expand Down
13 changes: 7 additions & 6 deletions src/dataset/interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,20 @@ const FeatNames = Union{Vector{<:Base.Callable}, Nothing}
const DEFAULT_FE = (
features = catch9,
type = adaptivewindow,
nwindows = 10,
relative_overlap = 0.2
wholewindow => (nwindows = 1,),
splitwindow => (nwindows = 20,),
adaptivewindow => (nwindows = 20, relative_overlap = 0.5)

# const AVAIL_WINS = (movingwindow, wholewindow, splitwindow, adaptivewindow)
# Window functions available in this package: the same four names that are
# re-exported from SoleBase.
const AVAIL_WINS = (movingwindow, wholewindow, splitwindow, adaptivewindow)
# Window functions accepted for feature extraction. The `type` argument of
# `feature_selection_preprocess` is validated against this tuple; note that
# `movingwindow` is not part of it.
const FE_AVAIL_WINS = (wholewindow, splitwindow, adaptivewindow)
# const AVAIL_TREATMENTS = (:aggregate, :reducesize)

const WIN_PARAMS = Dict(
movingwindow => (window_size = 1024, window_step = 512),
wholewindow => NamedTuple(),
splitwindow => (nwindows = 20),
splitwindow => (nwindows = 20,),
adaptivewindow => (nwindows = 20, relative_overlap = 0.5)

Expand Down
127 changes: 72 additions & 55 deletions src/dataset/prepare_dataset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ function _treatment(
# Fill DataFrame
for row in eachrow(X)
row_intervals = winparams.type(maximum(length.(collect(row))); _wparams...)
# interval_dif is used in case we encounter a row with less intervals than the maximum
# interval_diff is used in case we encounter a row with less intervals than the maximum
interval_diff = length(n_intervals) - length(row_intervals)

if treatment == :aggregate
Expand Down Expand Up @@ -348,41 +348,58 @@ end
vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing,
features::Union{Vector{<:Base.Callable}, Nothing}=nothing,
nwindows::Union{Int, Nothing}=nothing
) -> DataFrame
type::Union{Base.Callable, Nothing}=nothing,
nwindows::Union{Int, Nothing}=nothing,
relative_overlap::Union{AbstractFloat, Nothing}=nothing
) -> Tuple{DataFrame, Vector{InfoFeat}}
Process a DataFrame for feature selection by converting its columns into Feature objects.
Preprocess a dataset for feature selection by transforming the input DataFrame
into a feature-extracted representation and creating corresponding metadata.
# Arguments
- `X::DataFrame`: Input DataFrame containing time series data
- `vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing`: Names for the variables.
If nothing, uses DataFrame column names
- `features::Union{Vector{<:Base.Callable}, Nothing}=nothing`: Feature extraction functions.
If nothing, uses DEFAULT_FE.features
- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for time series segmentation.
- `X::DataFrame`: Input data to process. Can contain numeric columns or vector-valued columns.
- `vnames::VarNames=nothing`: Names of columns to process. If `nothing`, uses all columns in `X`.
- `features::FeatNames=nothing`: Feature extraction functions to apply. If `nothing`, uses
- `type::Union{Base.Callable, Nothing}=nothing`: Window type function that must be a key in
`WIN_PARAMS`. Determines how data is windowed.
- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for feature extraction. Must be
positive if provided. Automatically set to 1 if `type=wholewindow` and not explicitly provided.
- `relative_overlap::Union{AbstractFloat, Nothing}=nothing`: Overlap between consecutive windows.
Must be non-negative if provided.
# Returns
- `DataFrame`: A DataFrame where each element is a Feature object containing:
- value: extracted feature value
- var: variable name
- feats: feature extraction function used
- nwin: window number
- `Tuple{DataFrame, Vector{InfoFeat}}`: A tuple containing:
1. A processed DataFrame with features extracted according to specified parameters
2. A vector of `InfoFeat` objects containing metadata for each extracted feature
# Example
# Throws
- `ArgumentError`: If `type` is not in `WIN_PARAMS`, `nwindows` is not positive,
or `relative_overlap` is negative.
- `DimensionMismatch`: If elements have inconsistent dimensions (via `_check_dimensions`).
# Examples
# Basic usage with default parameters
df = DataFrame(a = [rand(10) for _ in 1:5])
result = feature_selection_preprocess(df)
# Custom features and windows
df = DataFrame(a = [rand(10) for _ in 1:5])
result = feature_selection_preprocess(df,
features = [mean, std],
nwindows = 3
# Basic usage with defaults
X_processed, Xinfo = feature_selection_preprocess(df)
# Specify feature extraction functions
X_processed, Xinfo = feature_selection_preprocess(df,
features=[minimum, maximum, mean])
# Specify windowing parameters
X_processed, Xinfo = feature_selection_preprocess(df,
# Combine parameters for more control
X_processed, Xinfo = feature_selection_preprocess(df,
vnames=["sensor1", "sensor2"],
features=[std, skewness],
function feature_selection_preprocess(
Expand All @@ -392,36 +409,36 @@ function feature_selection_preprocess(
nwindows::Union{Int, Nothing}=nothing,
relative_overlap::Union{AbstractFloat, Nothing}=nothing
# check parameters
# validate parameters
isnothing(vnames) && (vnames = names(X))
isnothing(features) && (features = DEFAULT_FE.features)
treatment = :aggregate
_ = _check_dimensions(X)

if !isnothing(type)
type ∈ keys(WIN_PARAMS) || throw(ArgumentError("Invalid window type."))
if !isnothing(nwindows)
nwindows > 0 || throw(ArgumentError("Number of windows must be positive."))
if !isnothing(relative_overlap)
relative_overlap ≥ 0 || throw(ArgumentError("Overlap must non negative."))
_ = _check_dimensions(X) # TODO multidimensions
!isnothing(type) && type ∉ FE_AVAIL_WINS && throw(ArgumentError("Invalid window type."))
!isnothing(nwindows) && nwindows ≤ 0 && throw(ArgumentError("Number of windows must be positive."))
!isnothing(relative_overlap) && relative_overlap < 0 && throw(ArgumentError("Overlap must be non-negative."))

winparams = begin
base_params = isnothing(type) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (type = type,))
base_params = isnothing(nwindows) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (nwindows = nwindows,))
isnothing(relative_overlap) ? base_params : merge(base_params, (relative_overlap = relative_overlap,))

total_features = length(features) * length(vnames) * nwindows
Xinfo = Vector{InfoFeat}(undef, total_features)
idx = 1

for f in features, v in vnames, n in 1:nwindows
Xinfo[idx] = InfoFeat(idx, v, Symbol(f), n)
idx += 1
# build winparams
winparams = merge(DEFAULT_WIN_PARAMS[type], (type = type,))
!isnothing(nwindows) && haskey(winparams, :nwindows) && (winparams = merge(winparams, (nwindows = nwindows,)))
!isnothing(relative_overlap) && haskey(winparams, :relative_overlap) && (winparams = merge(winparams, (relative_overlap = relative_overlap,)))

# set nwindows = 1 if type is wholewindow
isnothing(nwindows) && !isnothing(type) && type == wholewindow && (nwindows = 1)

# create Xinfo
nf, nv, nw = length(features), length(vnames), nwindows
Xinfo = [
(f_idx-1) * nv * nw + (v_idx-1) * nw + w_idx,
for f_idx in 1:nf
for v_idx in 1:nv
for w_idx in 1:nw

_treatment(X, vnames, treatment, features, winparams), Xinfo
18 changes: 9 additions & 9 deletions src/experimental/extraction.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ function _extract(v::AbstractVector, e::Extractor)
return res

# function extract(df::AbstractDataFrame, es::Array{<:Extractor})
# return DataFrame(string.(es) .=> _extract.(getindex.([df], :, getindex.(es, 1)), es))
# end

function extract(df::AbstractDataFrame, es::Array{<:Extractor})
m = Matrix(undef, size(df, 1), length(es))
Threads.@threads for (i, e) in collect(enumerate(es))
m[:, i] .= _extract(df[:, e[1]], e)
return DataFrame([[v for v in m[:,i]] for i in 1:size(m, 2)], string.(es))
return DataFrame(string.(es) .=> _extract.(getindex.([df], :, getindex.(es, 1)), es))

# function extract(df::AbstractDataFrame, es::Array{<:Extractor})
# m = Matrix(undef, size(df, 1), length(es))
# Threads.@threads for (i, e) in collect(enumerate(es))
# m[:, i] .= _extract(df[:, e[1]], e)
# end
# return DataFrame([[v for v in m[:,i]] for i in 1:size(m, 2)], string.(es))
# end

function groupby(es::Array{<:Extractor}, idxes::Union{Int, NTuple{N, Int}}) where {N}
res = Dict{Any, Vector{Extractor}}()
for e in es
Expand Down
7 changes: 7 additions & 0 deletions src/selection/fselection.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
for i in 1:size(a[1],2)
if a[1][:,i] != b[1][:,i]

# valid_X[:, 1] = [-0.531415, -0.493256, -0.536751, -0.57022, -0.663721,
16 changes: 9 additions & 7 deletions test/benchmarks/01_FS_Base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,8 @@ function feature_selection(

cache_extracted_dataset::Union{Nothing,AbstractString} = nothing,
return_mid_results::Union{Val{true},Val{false}} = Val(true),
# )::Union{DataFrame,Tuple{DataFrame,FSMidResults}}

# ==================== PREPARE INPUTS ====================

Expand Down Expand Up @@ -577,7 +578,7 @@ function feature_selection(

if isa(return_mid_results, Val{true})

return newX[:,dataset_col_slice], (extraction_column_names = extraction_column_names, fs_mid_results = fs_mid_results)
return newX, newX[:,dataset_col_slice], (extraction_column_names = extraction_column_names, fs_mid_results = fs_mid_results)

return newX[:,dataset_col_slice]
Expand Down Expand Up @@ -825,7 +826,8 @@ end
# load a time-series dataset
df, y = SoleData.load_arff_dataset("NATOPS")

ws = [FixedNumMovingWindows(6, 0.05)...]
# ws = [FixedNumMovingWindows(6, 0.05)...]
ws = [CenteredMovingWindow(1)...]
ms = [minimum, maximum, mean]

fs_methods = [
Expand All @@ -843,12 +845,12 @@ fs_methods = [

# prepare dataset for feature selection
Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, nwindows=6)


X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)
# X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)

b = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)

# using BenchmarkTools
# @btime X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)

Expand Down
23 changes: 10 additions & 13 deletions test/benchmarks/03_FS_newStruct.jl
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ function feature_selection(

# this is only used to generate plots
# fs_mid_results = NamedTuple{(:score,:indices,:name2score,:group_aggr_func,:group_indices,:aggrby)}[]
fs_mid_results = NamedTuple{(:indices,:group_aggr_func,:group_indices,:aggrby)}[]
fs_mid_results = NamedTuple{(:score, :indices,:group_aggr_func,:group_indices,:aggrby)}[]

for (fsm, gfs_params) in zip(fs_methods, aggrby)
current_dataset_col_slice = 1:size(X, 2)
Expand All @@ -631,15 +631,14 @@ function feature_selection(
current_dataset_col_slice = current_dataset_col_slice[fs_mid_results[i].indices]

currX = @view X[:,current_dataset_col_slice]
currXinfo = @view Xinfo[current_dataset_col_slice]
currX = X[:,current_dataset_col_slice]
currXinfo = Xinfo[current_dataset_col_slice]

dataset_param = isnothing(y_coded) || SoleFeatures.is_unsupervised(fsm.selector) ?
(currX, currXinfo) :
(currX, y_coded, currXinfo)

# score, idxes, g_indices =
idxes, scores, g_indices =
idxes, score, g_indices =
if isnothing(gfs_params)
# perform normal feature selection
_fs(dataset_param..., fsm...)..., nothing
Expand All @@ -661,9 +660,8 @@ function feature_selection(

push!(fs_mid_results, (
# score = score,
score = score,
indices = idxes,
# name2score = Dict{String,Number}(names(currX) .=> score),
group_aggr_func = isnothing(gfs_params) ? nothing : gfs_params.aggregatef,
group_indices = g_indices,
aggrby = isnothing(gfs_params) ? nothing : gfs_params.aggrby
Expand All @@ -676,13 +674,12 @@ function feature_selection(

if isa(return_mid_results, Val{true})

return X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results)

return X, X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results)
return X[:,dataset_col_slice]
# Convenience method for tabular input: convert the DataFrame to a `Matrix`
# and forward every remaining positional and keyword argument unchanged to
# the matrix-based `feature_selection` method.
function feature_selection(X::AbstractDataFrame, args...; kwargs...)
    return feature_selection(Matrix(X), args...; kwargs...)
end

TODO: docs
Expand Down Expand Up @@ -942,13 +939,13 @@ fs_methods = [

# prepare dataset for feature selection
Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=SoleFeatures.adaptivewindow, nwindows=6, relative_overlap=0.2)
# Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=adaptivewindow, nwindows=6, relative_overlap=0.05)
Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=wholewindow)


using BenchmarkTools

Xm = Matrix(Xdf)
feature_selection(Xm, y, Xinfo, fs_methods = fs_methods, norm = false)
a = feature_selection(Xdf, y, Xinfo, fs_methods = fs_methods, norm = false)

# 3.212 ms (52923 allocations: 4.37 MiB)

