From 18b1602420afc8e652184ba018b6889362ed0a9a Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Wed, 5 Mar 2025 00:14:09 +0100 Subject: [PATCH] @view trouble --- Project.toml | 2 + src/SoleFeatures.jl | 3 + src/dataset/interface.jl | 13 +-- src/dataset/prepare_dataset.jl | 127 ++++++++++++++++------------- src/experimental/extraction.jl | 18 ++-- src/selection/fselection.jl | 7 ++ test/benchmarks/01_FS_Base.jl | 16 ++-- test/benchmarks/03_FS_newStruct.jl | 23 +++--- 8 files changed, 119 insertions(+), 90 deletions(-) create mode 100644 src/selection/fselection.jl diff --git a/Project.toml b/Project.toml index ac104ea..d7dc895 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,7 @@ MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" @@ -29,6 +30,7 @@ MultiData = "0.1.4" NearestNeighbors = "0.4.21" OrderedCollections = "1" Random = "1" +Reexport = "1.2.2" SoleBase = "0.13" SparseArrays = "1.11.0" SpecialFunctions = "2.5.0" diff --git a/src/SoleFeatures.jl b/src/SoleFeatures.jl index 820ad58..73c515b 100644 --- a/src/SoleFeatures.jl +++ b/src/SoleFeatures.jl @@ -5,6 +5,9 @@ using SoleBase using MultiData using StatsBase, Catch22 +using Reexport +@reexport using SoleBase: movingwindow, wholewindow, splitwindow, adaptivewindow + using SpecialFunctions # For digamma function using NearestNeighbors # For KDTree and knn using SparseArrays, CategoricalArrays, DataFrames diff --git a/src/dataset/interface.jl b/src/dataset/interface.jl index 39f5933..70e9113 100644 --- a/src/dataset/interface.jl +++ b/src/dataset/interface.jl @@ -48,19 +48,20 @@ const FeatNames = Union{Vector{<:Base.Callable}, 
Nothing} const DEFAULT_FE = ( features = catch9, ) -const DEFAULT_FE_WINPARAMS = ( - type = adaptivewindow, - nwindows = 10, - relative_overlap = 0.2 +const DEFAULT_WIN_PARAMS = Dict( + wholewindow => (nwindows = 1,), + splitwindow => (nwindows = 20,), + adaptivewindow => (nwindows = 20, relative_overlap = 0.5) ) -# const AVAIL_WINS = (movingwindow, wholewindow, splitwindow, adaptivewindow) +const AVAIL_WINS = (movingwindow, wholewindow, splitwindow, adaptivewindow) +const FE_AVAIL_WINS = (wholewindow, splitwindow, adaptivewindow) # const AVAIL_TREATMENTS = (:aggregate, :reducesize) const WIN_PARAMS = Dict( movingwindow => (window_size = 1024, window_step = 512), wholewindow => NamedTuple(), - splitwindow => (nwindows = 20), + splitwindow => (nwindows = 20,), adaptivewindow => (nwindows = 20, relative_overlap = 0.5) ) diff --git a/src/dataset/prepare_dataset.jl b/src/dataset/prepare_dataset.jl index b7c5119..09a69eb 100644 --- a/src/dataset/prepare_dataset.jl +++ b/src/dataset/prepare_dataset.jl @@ -106,7 +106,7 @@ function _treatment( # Fill DataFrame for row in eachrow(X) row_intervals = winparams.type(maximum(length.(collect(row))); _wparams...) 
- # interval_dif is used in case we encounter a row with less intervals than the maximum + # interval_diff is used in case we encounter a row with fewer intervals than the maximum interval_diff = length(n_intervals) - length(row_intervals) if treatment == :aggregate @@ -348,41 +348,58 @@ end """ feature_selection_preprocess( X::DataFrame; - vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing, - features::Union{Vector{<:Base.Callable}, Nothing}=nothing, - nwindows::Union{Int, Nothing}=nothing - ) -> DataFrame + vnames::VarNames=nothing, + features::FeatNames=nothing, + type::Union{Base.Callable, Nothing}=nothing, + nwindows::Union{Int, Nothing}=nothing, + relative_overlap::Union{AbstractFloat, Nothing}=nothing + ) -> Tuple{DataFrame, Vector{InfoFeat}} -Process a DataFrame for feature selection by converting its columns into Feature objects. +Preprocess a dataset for feature selection by transforming the input DataFrame +into a feature-extracted representation and creating corresponding metadata. # Arguments -- `X::DataFrame`: Input DataFrame containing time series data -- `vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing`: Names for the variables. - If nothing, uses DataFrame column names -- `features::Union{Vector{<:Base.Callable}, Nothing}=nothing`: Feature extraction functions. - If nothing, uses DEFAULT_FE.features -- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for time series segmentation. - If nothing, uses DEFAULT_FE_WINPARAMS +- `X::DataFrame`: Input data to process. Can contain numeric columns or vector-valued columns. +- `vnames::VarNames=nothing`: Names of columns to process. If `nothing`, uses all columns in `X`. +- `features::FeatNames=nothing`: Feature extraction functions to apply. If `nothing`, uses + `DEFAULT_FE.features`. +- `type::Union{Base.Callable, Nothing}=nothing`: Window type function that must be one of + `FE_AVAIL_WINS`. Determines how data is windowed. 
+- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for feature extraction. Must be + positive if provided. Automatically set to 1 if `type=wholewindow` and not explicitly provided. +- `relative_overlap::Union{AbstractFloat, Nothing}=nothing`: Overlap between consecutive windows. + Must be non-negative if provided. # Returns -- `DataFrame`: A DataFrame where each element is a Feature object containing: - - value: extracted feature value - - var: variable name - - feats: feature extraction function used - - nwin: window number +- `Tuple{DataFrame, Vector{InfoFeat}}`: A tuple containing: + 1. A processed DataFrame with features extracted according to specified parameters + 2. A vector of `InfoFeat` objects containing metadata for each extracted feature -# Example +# Throws +- `ArgumentError`: If `type` is not in `FE_AVAIL_WINS`, `nwindows` is not positive, + or `relative_overlap` is negative. +- `DimensionMismatch`: If elements have inconsistent dimensions (via `_check_dimensions`). 
+ # Examples ```julia -# Basic usage with default parameters -df = DataFrame(a = [rand(10) for _ in 1:5]) -result = feature_selection_preprocess(df) - -# Custom features and windows -df = DataFrame(a = [rand(10) for _ in 1:5]) -result = feature_selection_preprocess(df, - features = [mean, std], - nwindows = 3 -) +# Basic usage with defaults +X_processed, Xinfo = feature_selection_preprocess(df) + +# Specify feature extraction functions +X_processed, Xinfo = feature_selection_preprocess(df, + features=[minimum, maximum, mean]) + +# Specify windowing parameters +X_processed, Xinfo = feature_selection_preprocess(df, + nwindows=5, + relative_overlap=0.2) + +# Combine parameters for more control +X_processed, Xinfo = feature_selection_preprocess(df, + vnames=["sensor1", "sensor2"], + features=[std, skewness], + type=adaptivewindow, + nwindows=3) """ function feature_selection_preprocess( X::DataFrame; @@ -392,36 +409,36 @@ function feature_selection_preprocess( nwindows::Union{Int, Nothing}=nothing, relative_overlap::Union{AbstractFloat, Nothing}=nothing ) - # check parameters + # validate parameters isnothing(vnames) && (vnames = names(X)) isnothing(features) && (features = DEFAULT_FE.features) treatment = :aggregate - _ = _check_dimensions(X) - - if !isnothing(type) - type ∈ keys(WIN_PARAMS) || throw(ArgumentError("Invalid window type.")) - end - if !isnothing(nwindows) - nwindows > 0 || throw(ArgumentError("Number of windows must be positive.")) - end - if !isnothing(relative_overlap) - relative_overlap ≥ 0 || throw(ArgumentError("Overlap must non negative.")) - end + _ = _check_dimensions(X) # TODO multidimensions + !isnothing(type) && type ∉ FE_AVAIL_WINS && throw(ArgumentError("Invalid window type.")) + !isnothing(nwindows) && nwindows ≤ 0 && throw(ArgumentError("Number of windows must be positive.")) + !isnothing(relative_overlap) && relative_overlap < 0 && throw(ArgumentError("Overlap must be non-negative.")) - winparams = begin - base_params = isnothing(type) ? 
DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (type = type,)) - base_params = isnothing(nwindows) ? DEFAULT_FE_WINPARAMS : merge(DEFAULT_FE_WINPARAMS, (nwindows = nwindows,)) - isnothing(relative_overlap) ? base_params : merge(base_params, (relative_overlap = relative_overlap,)) - end - - total_features = length(features) * length(vnames) * nwindows - Xinfo = Vector{InfoFeat}(undef, total_features) - idx = 1 - - for f in features, v in vnames, n in 1:nwindows - Xinfo[idx] = InfoFeat(idx, v, Symbol(f), n) - idx += 1 - end + # build winparams + winparams = merge(DEFAULT_WIN_PARAMS[type], (type = type,)) + !isnothing(nwindows) && haskey(winparams, :nwindows) && (winparams = merge(winparams, (nwindows = nwindows,))) + !isnothing(relative_overlap) && haskey(winparams, :relative_overlap) && (winparams = merge(winparams, (relative_overlap = relative_overlap,))) + + # set nwindows = 1 if type is wholewindow + isnothing(nwindows) && !isnothing(type) && type == wholewindow && (nwindows = 1) + + # create Xinfo + nf, nv, nw = length(features), length(vnames), nwindows + Xinfo = [ + InfoFeat( + (f_idx-1) * nv * nw + (v_idx-1) * nw + w_idx, + vnames[v_idx], + Symbol(features[f_idx]), + w_idx + ) + for f_idx in 1:nf + for v_idx in 1:nv + for w_idx in 1:nw + ] _treatment(X, vnames, treatment, features, winparams), Xinfo end diff --git a/src/experimental/extraction.jl b/src/experimental/extraction.jl index 19a250d..1c6a209 100644 --- a/src/experimental/extraction.jl +++ b/src/experimental/extraction.jl @@ -23,18 +23,18 @@ function _extract(v::AbstractVector, e::Extractor) return res end -# function extract(df::AbstractDataFrame, es::Array{<:Extractor}) -# return DataFrame(string.(es) .=> _extract.(getindex.([df], :, getindex.(es, 1)), es)) -# end - function extract(df::AbstractDataFrame, es::Array{<:Extractor}) - m = Matrix(undef, size(df, 1), length(es)) - Threads.@threads for (i, e) in collect(enumerate(es)) - m[:, i] .= _extract(df[:, e[1]], e) - end - return DataFrame([[v 
for v in m[:,i]] for i in 1:size(m, 2)], string.(es)) + return DataFrame(string.(es) .=> _extract.(getindex.([df], :, getindex.(es, 1)), es)) end +# function extract(df::AbstractDataFrame, es::Array{<:Extractor}) +# m = Matrix(undef, size(df, 1), length(es)) +# Threads.@threads for (i, e) in collect(enumerate(es)) +# m[:, i] .= _extract(df[:, e[1]], e) +# end +# return DataFrame([[v for v in m[:,i]] for i in 1:size(m, 2)], string.(es)) +# end + function groupby(es::Array{<:Extractor}, idxes::Union{Int, NTuple{N, Int}}) where {N} res = Dict{Any, Vector{Extractor}}() for e in es diff --git a/src/selection/fselection.jl b/src/selection/fselection.jl new file mode 100644 index 0000000..21809e7 --- /dev/null +++ b/src/selection/fselection.jl @@ -0,0 +1,7 @@ +for i in 1:size(a[1],2) + if a[1][:,i] != b[1][:,i] + println(i) + end +end + +# valid_X[:, 1] = [-0.531415, -0.493256, -0.536751, -0.57022, -0.663721, \ No newline at end of file diff --git a/test/benchmarks/01_FS_Base.jl b/test/benchmarks/01_FS_Base.jl index 18e7922..d05203f 100644 --- a/test/benchmarks/01_FS_Base.jl +++ b/test/benchmarks/01_FS_Base.jl @@ -459,7 +459,8 @@ function feature_selection( cache_extracted_dataset::Union{Nothing,AbstractString} = nothing, return_mid_results::Union{Val{true},Val{false}} = Val(true), -)::Union{DataFrame,Tuple{DataFrame,FSMidResults}} +# )::Union{DataFrame,Tuple{DataFrame,FSMidResults}} +) # ==================== PREPARE INPUTS ==================== @@ -577,7 +578,7 @@ function feature_selection( if isa(return_mid_results, Val{true}) - return newX[:,dataset_col_slice], (extraction_column_names = extraction_column_names, fs_mid_results = fs_mid_results) + return newX, newX[:,dataset_col_slice], (extraction_column_names = extraction_column_names, fs_mid_results = fs_mid_results) else return newX[:,dataset_col_slice] @@ -825,7 +826,8 @@ end # load a time-series dataset df, y = SoleData.load_arff_dataset("NATOPS") -ws = [FixedNumMovingWindows(6, 0.05)...] 
+# ws = [FixedNumMovingWindows(6, 0.05)...] +ws = [CenteredMovingWindow(1)...] ms = [minimum, maximum, mean] fs_methods = [ @@ -843,12 +845,12 @@ fs_methods = [ ), ] -# prepare dataset for feature selection -Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, nwindows=6) - @info "FEATURE SELECTION" -X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true) +# X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true) + +b = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true) + # using BenchmarkTools # @btime X, fs_mid_results = feature_selection(df, y, ex_windows = ws, ex_measures = ms, fs_methods = fs_methods, normalize = true) diff --git a/test/benchmarks/03_FS_newStruct.jl b/test/benchmarks/03_FS_newStruct.jl index 854872a..c51e8cd 100644 --- a/test/benchmarks/03_FS_newStruct.jl +++ b/test/benchmarks/03_FS_newStruct.jl @@ -621,7 +621,7 @@ function feature_selection( # questo serve solo per generare grafici # fs_mid_results = NamedTuple{(:score,:indices,:name2score,:group_aggr_func,:group_indices,:aggrby)}[] - fs_mid_results = NamedTuple{(:indices,:group_aggr_func,:group_indices,:aggrby)}[] + fs_mid_results = NamedTuple{(:score, :indices,:group_aggr_func,:group_indices,:aggrby)}[] for (fsm, gfs_params) in zip(fs_methods, aggrby) current_dataset_col_slice = 1:size(X, 2) @@ -631,15 +631,14 @@ function feature_selection( current_dataset_col_slice = current_dataset_col_slice[fs_mid_results[i].indices] end - currX = @view X[:,current_dataset_col_slice] - currXinfo = @view Xinfo[current_dataset_col_slice] + currX = X[:,current_dataset_col_slice] + currXinfo = Xinfo[current_dataset_col_slice] dataset_param = isnothing(y_coded) || SoleFeatures.is_unsupervised(fsm.selector) ? 
(currX, currXinfo) : (currX, y_coded, currXinfo) - # score, idxes, g_indices = - idxes, scores, g_indices = + idxes, score, g_indices = if isnothing(gfs_params) # perform normal feature selection _fs(dataset_param..., fsm...)..., nothing @@ -661,9 +660,8 @@ function feature_selection( sort!(idxes) push!(fs_mid_results, ( - # score = score, + score = score, indices = idxes, - # name2score = Dict{String,Number}(names(currX) .=> score), group_aggr_func = isnothing(gfs_params) ? nothing : gfs_params.aggregatef, group_indices = g_indices, aggrby = isnothing(gfs_params) ? nothing : gfs_params.aggrby @@ -676,13 +674,12 @@ function feature_selection( end if isa(return_mid_results, Val{true}) - - return X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results) - + return X, X[:,dataset_col_slice], (extraction_column_names = Xinfo[dataset_col_slice], fs_mid_results = fs_mid_results) else return X[:,dataset_col_slice] end end +feature_selection(X::AbstractDataFrame, args...; kwargs...) = feature_selection(Matrix(X), args...; kwargs...) """ TODO: docs @@ -942,13 +939,13 @@ fs_methods = [ ] # prepare dataset for feature selection -Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=SoleFeatures.adaptivewindow, nwindows=6, relative_overlap=0.2) +# Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=adaptivewindow, nwindows=6, relative_overlap=0.05) +Xdf, Xinfo = @test_nowarn SoleFeatures.feature_selection_preprocess(df; features=ms, type=wholewindow) @info "FEATURE SELECTION" using BenchmarkTools -Xm = Matrix(Xdf) -feature_selection(Xm, y, Xinfo, fs_methods = fs_methods, norm = false) +a = feature_selection(Xdf, y, Xinfo, fs_methods = fs_methods, norm = false) # 3.212 ms (52923 allocations: 4.37 MiB)