@view trouble

aclai-lab · Mar 4, 2025 · 18b1602 · 18b1602
1 parent 6838d29
commit 18b1602
Show file tree

Hide file tree

Showing 8 changed files with 119 additions and 90 deletions.
diff --git a/Project.toml b/Project.toml
@@ -14,6 +14,7 @@ MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba"
 NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
@@ -29,6 +30,7 @@ MultiData = "0.1.4"
 NearestNeighbors = "0.4.21"
 OrderedCollections = "1"
 Random = "1"
+Reexport = "1.2.2"
 SoleBase = "0.13"
 SparseArrays = "1.11.0"
 SpecialFunctions = "2.5.0"

diff --git a/src/SoleFeatures.jl b/src/SoleFeatures.jl
@@ -5,6 +5,9 @@ using SoleBase
 using MultiData
 using StatsBase, Catch22
 
+using Reexport
+@reexport using SoleBase: movingwindow, wholewindow, splitwindow, adaptivewindow
+
 using SpecialFunctions  # For digamma function
 using NearestNeighbors  # For KDTree and knn
 using SparseArrays, CategoricalArrays, DataFrames

diff --git a/src/dataset/interface.jl b/src/dataset/interface.jl
@@ -48,19 +48,20 @@ const FeatNames = Union{Vector{<:Base.Callable}, Nothing}
 const DEFAULT_FE = (
     features = catch9,
 )
-const DEFAULT_FE_WINPARAMS = (
-    type = adaptivewindow,
-    nwindows = 10,
-    relative_overlap = 0.2
+const DEFAULT_WIN_PARAMS = Dict(
+    wholewindow    => (nwindows = 1,),
+    splitwindow    => (nwindows = 20,),
+    adaptivewindow => (nwindows = 20, relative_overlap = 0.5)
 )
 
-# const AVAIL_WINS       = (movingwindow, wholewindow, splitwindow, adaptivewindow)
+const AVAIL_WINS       = (movingwindow, wholewindow, splitwindow, adaptivewindow)
+const FE_AVAIL_WINS    = (wholewindow, splitwindow, adaptivewindow)
 # const AVAIL_TREATMENTS = (:aggregate, :reducesize)
 
 const WIN_PARAMS = Dict(
     movingwindow   => (window_size = 1024, window_step = 512),
     wholewindow    => NamedTuple(),
-    splitwindow    => (nwindows = 20),
+    splitwindow    => (nwindows = 20,),
     adaptivewindow => (nwindows = 20, relative_overlap = 0.5)
 )
 

diff --git a/src/dataset/prepare_dataset.jl b/src/dataset/prepare_dataset.jl
@@ -106,7 +106,7 @@ function _treatment(
     # Fill DataFrame
     for row in eachrow(X)
         row_intervals = winparams.type(maximum(length.(collect(row))); _wparams...)
-        # interval_dif is used in case we encounter a row with less intervals than the maximum
+        # interval_diff is used in case we encounter a row with fewer intervals than the maximum
         interval_diff = length(n_intervals) - length(row_intervals)
 
         if treatment == :aggregate
@@ -348,41 +348,58 @@ end
 """
     feature_selection_preprocess(
         X::DataFrame;
-        vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing,
-        features::Union{Vector{<:Base.Callable}, Nothing}=nothing,
-        nwindows::Union{Int, Nothing}=nothing
-    ) -> DataFrame
+        vnames::VarNames=nothing,
+        features::FeatNames=nothing,
+        type::Union{Base.Callable, Nothing}=nothing,
+        nwindows::Union{Int, Nothing}=nothing,
+        relative_overlap::Union{AbstractFloat, Nothing}=nothing
+    ) -> Tuple{DataFrame, Vector{InfoFeat}}
 
-Process a DataFrame for feature selection by converting its columns into Feature objects.
+Preprocess a dataset for feature selection by transforming the input DataFrame
+into a feature-extracted representation and creating corresponding metadata.
 
 # Arguments
-- `X::DataFrame`: Input DataFrame containing time series data
-- `vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing`: Names for the variables. 
-   If nothing, uses DataFrame column names
-- `features::Union{Vector{<:Base.Callable}, Nothing}=nothing`: Feature extraction functions. 
-   If nothing, uses DEFAULT_FE.features
-- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for time series segmentation. 
-   If nothing, uses DEFAULT_FE_WINPARAMS
+- `X::DataFrame`: Input data to process. Can contain numeric columns or vector-valued columns.
+- `vnames::VarNames=nothing`: Names of columns to process. If `nothing`, uses all columns in `X`.
+- `features::FeatNames=nothing`: Feature extraction functions to apply. If `nothing`, uses 
+  `DEFAULT_FE.features`.
+- `type::Union{Base.Callable, Nothing}=nothing`: Window type function; must be one of
+  `FE_AVAIL_WINS` (`wholewindow`, `splitwindow`, `adaptivewindow`). Determines how data is windowed.
+- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for feature extraction. Must be 
+  positive if provided. Automatically set to 1 if `type=wholewindow` and not explicitly provided.
+- `relative_overlap::Union{AbstractFloat, Nothing}=nothing`: Overlap between consecutive windows.
+  Must be non-negative if provided; ignored by window types that take no overlap parameter.
 
 # Returns
-- `DataFrame`: A DataFrame where each element is a Feature object containing:
-  - value: extracted feature value
-  - var: variable name
-  - feats: feature extraction function used
-  - nwin: window number
+- `Tuple{DataFrame, Vector{InfoFeat}}`: A tuple containing:
+  1. A processed DataFrame with features extracted according to specified parameters
+  2. A vector of `InfoFeat` objects containing metadata for each extracted feature
 
-# Example
+# Throws
+- `ArgumentError`: If `type` is not in `FE_AVAIL_WINS`, `nwindows` is not positive, 
+  or `relative_overlap` is negative.
+- `DimensionMismatch`: If elements have inconsistent dimensions (via `_check_dimensions`).
+
+# Examples
 ```julia
-# Basic usage with default parameters
-df = DataFrame(a = [rand(10) for _ in 1:5])
-result = feature_selection_preprocess(df)
-
-# Custom features and windows
-df = DataFrame(a = [rand(10) for _ in 1:5])
-result = feature_selection_preprocess(df,
-    features = [mean, std],
-    nwindows = 3
-)
+# Basic usage with defaults
+X_processed, Xinfo = feature_selection_preprocess(df)
+
+# Specify feature extraction functions
+X_processed, Xinfo = feature_selection_preprocess(df, 
+                                                 features=[minimum, maximum, mean])
+
+# Specify windowing parameters
+X_processed, Xinfo = feature_selection_preprocess(df, 
+                                                 nwindows=5, 
+                                                 relative_overlap=0.2)
+
+# Combine parameters for more control
+X_processed, Xinfo = feature_selection_preprocess(df,
+                                                 vnames=["sensor1", "sensor2"],
+                                                 features=[std, skewness],
+                                                 type=adaptivewindow,
+                                                 nwindows=3)
 """
 function feature_selection_preprocess(
     X::DataFrame;
@@ -392,36 +409,36 @@ function feature_selection_preprocess(
     nwindows::Union{Int, Nothing}=nothing,
     relative_overlap::Union{AbstractFloat, Nothing}=nothing
 )
-    # check parameters
+    # validate parameters
     isnothing(vnames) && (vnames = names(X))
     isnothing(features) && (features = DEFAULT_FE.features)
     treatment = :aggregate
-    _ = _check_dimensions(X)
-
-    if !isnothing(type)
-        type ∈ keys(WIN_PARAMS) || throw(ArgumentError("Invalid window type."))
-    end
-    if !isnothing(nwindows)
-        nwindows > 0 || throw(ArgumentError("Number of windows must be positive."))
-    end
-    if !isnothing(relative_overlap)
-        relative_overlap ≥ 0 || throw(ArgumentError("Overlap must non negative."))
-    end
+    _ = _check_dimensions(X) # TODO multidimensions
+    !isnothing(type) && type ∉ FE_AVAIL_WINS && throw(ArgumentError("Invalid window type."))
+    !isnothing(nwindows) && nwindows ≤ 0 && throw(ArgumentError("Number of windows must be positive."))
+    !isnothing(relative_overlap) && relative_overlap < 0 && throw(ArgumentError("Overlap must be non-negative."))
 
-    winparams = begin
-        base_params = isnothing(type) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (type = type,))
-        base_params = isnothing(nwindows) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (nwindows = nwindows,))
-        isnothing(relative_overlap) ? base_params : merge(base_params, (relative_overlap = relative_overlap,))
-    end
-
-    total_features = length(features) * length(vnames) * nwindows
-    Xinfo = Vector{InfoFeat}(undef, total_features)
-    idx = 1
-
-    for f in features, v in vnames, n in 1:nwindows
-        Xinfo[idx] = InfoFeat(idx, v, Symbol(f), n)
-        idx += 1
-    end
+    # build winparams
+    winparams = merge(DEFAULT_WIN_PARAMS[type], (type = type,))
+    !isnothing(nwindows) && haskey(winparams, :nwindows) && (winparams = merge(winparams, (nwindows = nwindows,)))
+    !isnothing(relative_overlap) && haskey(winparams, :relative_overlap) && (winparams = merge(winparams, (relative_overlap = relative_overlap,)))
+
+    # set nwindows = 1 if type is wholewindow
+    isnothing(nwindows) && !isnothing(type) && type == wholewindow && (nwindows = 1)
+
+    # create Xinfo
+    nf, nv, nw = length(features), length(vnames), nwindows
+    Xinfo = [
+        InfoFeat(
+            (f_idx-1) * nv * nw + (v_idx-1) * nw + w_idx, 
+            vnames[v_idx],
+            Symbol(features[f_idx]), 
+            w_idx
+        )
+        for f_idx in 1:nf 
+        for v_idx in 1:nv 
+        for w_idx in 1:nw
+    ]
 
     _treatment(X, vnames, treatment, features, winparams), Xinfo
 end
diff --git a/src/experimental/extraction.jl b/src/experimental/extraction.jl
@@ -23,18 +23,18 @@ function _extract(v::AbstractVector, e::Extractor)
     return res
 end
 
-# function extract(df::AbstractDataFrame, es::Array{<:Extractor})
-#     return DataFrame(string.(es) .=> _extract.(getindex.([df], :, getindex.(es, 1)), es))
-# end
-
 function extract(df::AbstractDataFrame, es::Array{<:Extractor})
-    m = Matrix(undef, size(df, 1), length(es))
-    Threads.@threads for (i, e) in collect(enumerate(es))
-        m[:, i] .= _extract(df[:, e[1]], e)
-    end
-    return DataFrame([[v for v in m[:,i]] for i in 1:size(m, 2)], string.(es))
+    return DataFrame(string.(es) .=> _extract.(getindex.([df], :, getindex.(es, 1)), es))
 end
 
+# function extract(df::AbstractDataFrame, es::Array{<:Extractor})
+#     m = Matrix(undef, size(df, 1), length(es))
+#     Threads.@threads for (i, e) in collect(enumerate(es))
+#         m[:, i] .= _extract(df[:, e[1]], e)
+#     end
+#     return DataFrame([[v for v in m[:,i]] for i in 1:size(m, 2)], string.(es))
+# end
+
 function groupby(es::Array{<:Extractor}, idxes::Union{Int, NTuple{N, Int}}) where {N}
     res = Dict{Any, Vector{Extractor}}()
     for e in es

diff --git a/src/selection/fselection.jl b/src/selection/fselection.jl
@@ -0,0 +1,7 @@
+for i in 1:size(a[1],2)
+    if a[1][:,i] != b[1][:,i]
+        println(i)
+    end
+end
+
+# valid_X[:, 1] = [-0.531415, -0.493256, -0.536751, -0.57022, -0.663721, 
diff --git a/test/benchmarks/01_FS_Base.jl b/test/benchmarks/01_FS_Base.jl
@@ -459,7 +459,8 @@ function feature_selection(
 
     cache_extracted_dataset::Union{Nothing,AbstractString} = nothing,
     return_mid_results::Union{Val{true},Val{false}} = Val(true),
-)::Union{DataFrame,Tuple{DataFrame,FSMidResults}}
+# )::Union{DataFrame,Tuple{DataFrame,FSMidResults}}
+)
 
     # ==================== PREPARE INPUTS ====================
 
@@ -577,7 +578,7 @@ function feature_selection(
 
     if isa(return_mid_results, Val{true})
 
-        return newX[:,dataset_col_slice], (extraction_column_names = extraction_column_names, fs_mid_results = fs_mid_results)
+        return newX, newX[:,dataset_col_slice], (extraction_column_names = extraction_column_names, fs_mid_results = fs_mid_results)
 
     else
         return newX[:,dataset_col_slice]
@@ -825,7 +826,8 @@ end
 # load a time-series dataset
 df, y = SoleData.load_arff_dataset("NATOPS")
 
-ws = [FixedNumMovingWindows(6, 0.05)...]
+# ws = [FixedNumMovingWindows(6, 0.05)...]
+ws = [CenteredMovingWindow(1)...]
 ms = [minimum, maximum, mean]
 
 fs_methods = [
@@ -843,12 +845,12 @@ fs_methods = [
 	),
 ]
 
-# prepare dataset for feature selection
-Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, nwindows=6)
-
 @info "FEATURE SELECTION"
 
-X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)
+# X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)
+
+b = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)
+
 # using BenchmarkTools
 # @btime X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true)
 

diff --git a/test/benchmarks/03_FS_newStruct.jl b/test/benchmarks/03_FS_newStruct.jl
@@ -621,7 +621,7 @@ function feature_selection(
 
     # questo serve solo per generare grafici
     # fs_mid_results = NamedTuple{(:score,:indices,:name2score,:group_aggr_func,:group_indices,:aggrby)}[]
-    fs_mid_results = NamedTuple{(:indices,:group_aggr_func,:group_indices,:aggrby)}[]
+    fs_mid_results = NamedTuple{(:score, :indices,:group_aggr_func,:group_indices,:aggrby)}[]
 
     for (fsm, gfs_params) in zip(fs_methods, aggrby)
         current_dataset_col_slice = 1:size(X, 2)
@@ -631,15 +631,14 @@ function feature_selection(
             current_dataset_col_slice = current_dataset_col_slice[fs_mid_results[i].indices]
         end
 
-        currX = @view X[:,current_dataset_col_slice]
-        currXinfo = @view Xinfo[current_dataset_col_slice]
+        currX = X[:,current_dataset_col_slice]
+        currXinfo = Xinfo[current_dataset_col_slice]
 
         dataset_param = isnothing(y_coded) || SoleFeatures.is_unsupervised(fsm.selector) ? 
             (currX, currXinfo) : 
             (currX, y_coded, currXinfo)
 
-        # score, idxes, g_indices =
-        idxes, scores, g_indices =
+        idxes, score, g_indices =
             if isnothing(gfs_params)
                 # perform normal feature selection
                 _fs(dataset_param..., fsm...)..., nothing
@@ -661,9 +660,8 @@ function feature_selection(
         sort!(idxes)
 
         push!(fs_mid_results, (
-            # score = score,
+            score = score,
             indices = idxes,
-            # name2score = Dict{String,Number}(names(currX) .=> score),
             group_aggr_func = isnothing(gfs_params) ? nothing : gfs_params.aggregatef,
             group_indices = g_indices,
             aggrby = isnothing(gfs_params) ? nothing : gfs_params.aggrby
@@ -676,13 +674,12 @@ function feature_selection(
     end
 
     if isa(return_mid_results, Val{true})
-
-        return X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results)
-
+        return X, X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results)
     else
         return X[:,dataset_col_slice]
     end
 end
+feature_selection(X::AbstractDataFrame, args...; kwargs...) = feature_selection(Matrix(X), args...; kwargs...)
 
 """
 TODO: docs
@@ -942,13 +939,13 @@ fs_methods = [
 ]
 
 # prepare dataset for feature selection
-Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=SoleFeatures.adaptivewindow, nwindows=6, relative_overlap=0.2)
+# Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=adaptivewindow, nwindows=6, relative_overlap=0.05)
+Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=wholewindow)
 
 @info "FEATURE SELECTION"
 
 using BenchmarkTools
 
-Xm = Matrix(Xdf)
-feature_selection(Xm, y, Xinfo, fs_methods = fs_methods, norm = false)
+a = feature_selection(Xdf, y, Xinfo, fs_methods = fs_methods, norm = false)
 
 # 3.212 ms (52923 allocations: 4.37 MiB)