diff --git a/Project.toml b/Project.toml index 96cdeab..22225b6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,39 +1,29 @@ name = "SoleFeatures" uuid = "3ceb4e54-d968-4e97-8c18-2deeb0d429fb" authors = ["Patrik Cavina", "Federico Manzella", "Giovanni Pagliarini"] -version = "0.2.0" +version = "0.3.0" [deps] Catch22 = "acdeb78f-3d39-4310-8fdf-6d75c17c6d5a" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLBase = "f0e99cf1-93fa-52ec-9ecc-5026115318e0" -MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492" -SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" -SoleLogics = "b002da8f-3cb3-4d91-bbe3-2953433912b5" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] Catch22 = "0.7" +CategoricalArrays = "0.10" DataFrames = "1" -HypothesisTests = "0.10 - 0.11" +HypothesisTests = "0.11" IterTools = "1" -MultiData = "0 - 0.1" OrderedCollections = "1" -PyCall = "1" Random = "1" -Reexport = "1" -Revise = "3" -SoleData = "0.16" +SoleBase = "0.13" StatsBase = "0.30 - 0.34" julia = "1" diff --git a/src/SoleFeatures.jl b/src/SoleFeatures.jl index 2028298..7f7a14c 100644 --- a/src/SoleFeatures.jl +++ b/src/SoleFeatures.jl @@ -1,96 +1,111 @@ -__precompile__() +# __precompile__() module SoleFeatures -using StatsBase -using MultiData -using SoleData -using Reexport +using SoleBase +using StatsBase, Catch22 +using CategoricalArrays, DataFrames using Random -using LinearAlgebra -using HypothesisTests -using IterTools -using PyCall -using MLBase -# using Pkg -# abstracts -export AbstractFeaturesSelector -export AbstractFilterBased -export AbstractWrapperBased -export AbstractEmbeddedBased -export AbstractLimiter -# structs -export VarianceThreshold -export VarianceRanking -export RandomRanking -export StatisticalAtLeastOnce -export StatisticalMajority -export PearsonCorRanking -export Chi2Ranking -export Chi2Threshold -export MutualInformationClassifRanking -export CompoundStatisticalAtLeastOnce -export CompoundStatisticalMajority -export CorrelationFilter -# main functions -export apply, buildbitmask, transform, transform! 
-# utils -export bm2var +include("utils/features_set.jl") +export mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos +export outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing +export stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity +export base_set, catch9, catch22_set, complete_set -@reexport using DataFrames +include("dataset/dataset_structs.jl") +export Feature -const req_py_pkgs = ["scipy", "scikit-learn", "skfeature"] -const fs = PyNULL() -const construct_w = PyNULL() -const lap_score = PyNULL() -const fisher_score = PyNULL() -function __init__() +include("dataset/prepare_dataset.jl") +export feature_selection_preprocess - pypkgs = getindex.(PyCall.Conda.parseconda(`list`, PyCall.Conda.ROOTENV), "name") - needinstall = !all(p -> in(p, pypkgs), req_py_pkgs) +# using MultiData +# using SoleData +# using Reexport +# using LinearAlgebra +# using HypothesisTests +# using IterTools +# using PyCall +# using MLBase +# # using Pkg - if (needinstall) - PyCall.Conda.pip_interop(true, PyCall.Conda.ROOTENV) - PyCall.Conda.add("scipy") - PyCall.Conda.add("scikit-learn") - PyCall.Conda.pip( - "install", - "git+https://github.com/jundongl/scikit-feature.git#egg=skfeature", - PyCall.Conda.ROOTENV - ) - end +# # abstracts +# export AbstractFeaturesSelector +# export AbstractFilterBased +# export AbstractWrapperBased +# export AbstractEmbeddedBased +# export AbstractLimiter +# # structs +# export VarianceThreshold +# export VarianceRanking +# export RandomRanking +# export StatisticalAtLeastOnce +# export StatisticalMajority +# export PearsonCorRanking +# export Chi2Ranking +# export Chi2Threshold +# export MutualInformationClassifRanking +# export CompoundStatisticalAtLeastOnce +# export CompoundStatisticalMajority +# export CorrelationFilter +# # main functions +# export apply, buildbitmask, transform, transform! 
+# # utils +# export bm2var - copy!(fs, pyimport_conda("sklearn.feature_selection", "scikit-learn")) - copy!(construct_w, pyimport_conda("skfeature.utility.construct_W", "skfeature")) - copy!(lap_score, pyimport_conda( - "skfeature.function.similarity_based.lap_score", - "skfeature" - )) - copy!(fisher_score, pyimport_conda( - "skfeature.function.similarity_based.fisher_score", - "skfeature" - )) -end +# @reexport using DataFrames -include("interface.jl") -include("core.jl") -# Utils -include("utils/utils.jl") -# Filters -include("filters/limiter.jl") -include("filters/interface.jl") -include("filters/univariate/randomfilter.jl") -include("filters/univariate/statisticalfilter.jl") -include("filters/univariate/variancefilter.jl") -include("filters/univariate/chi2filter.jl") -include("filters/univariate/pearsoncorfilter.jl") -include("filters/univariate/mutualinformationclassif.jl") -include("filters/univariate/suplapscorefiler.jl") -include("filters/univariate/fisherscorefilter.jl") -include("filters/univariate/utils.jl") -include("filters/multivariate/correlationfilter.jl") -# Experimental -include("experimental/Experimental.jl") -import .Experimental +# const req_py_pkgs = ["scipy", "scikit-learn", "skfeature"] +# const fs = PyNULL() +# const construct_w = PyNULL() +# const lap_score = PyNULL() +# const fisher_score = PyNULL() +# function __init__() + +# pypkgs = getindex.(PyCall.Conda.parseconda(`list`, PyCall.Conda.ROOTENV), "name") +# needinstall = !all(p -> in(p, pypkgs), req_py_pkgs) + +# if (needinstall) +# PyCall.Conda.pip_interop(true, PyCall.Conda.ROOTENV) +# PyCall.Conda.add("scipy") +# PyCall.Conda.add("scikit-learn") +# PyCall.Conda.pip( +# "install", +# "git+https://github.com/jundongl/scikit-feature.git#egg=skfeature", +# PyCall.Conda.ROOTENV +# ) +# end + +# copy!(fs, pyimport_conda("sklearn.feature_selection", "scikit-learn")) +# copy!(construct_w, pyimport_conda("skfeature.utility.construct_W", "skfeature")) +# copy!(lap_score, pyimport_conda( +# "skfeature.function.similarity_based.lap_score", +# "skfeature" +# )) +# copy!(fisher_score, pyimport_conda( +# "skfeature.function.similarity_based.fisher_score", +# "skfeature" +# )) +# end + +# include("interface.jl") +# include("core.jl") +# # Utils +# include("utils/utils.jl") +# # Filters +# include("filters/limiter.jl") +# include("filters/interface.jl") +# include("filters/univariate/randomfilter.jl") +# include("filters/univariate/statisticalfilter.jl") +# include("filters/univariate/variancefilter.jl") +# include("filters/univariate/chi2filter.jl") +# include("filters/univariate/pearsoncorfilter.jl") +# include("filters/univariate/mutualinformationclassif.jl") +# include("filters/univariate/suplapscorefiler.jl") +# include("filters/univariate/fisherscorefilter.jl") +# include("filters/univariate/utils.jl") +# include("filters/multivariate/correlationfilter.jl") +# # Experimental +# include("experimental/Experimental.jl") +# import .Experimental end # module diff --git a/src/dataset/dataset_structs.jl b/src/dataset/dataset_structs.jl new file mode 100644 index 0000000..bff3c08 --- /dev/null +++ b/src/dataset/dataset_structs.jl @@ -0,0 +1,312 @@ +# ---------------------------------------------------------------------------- # +# dataset # +# ---------------------------------------------------------------------------- # +# const DEFAULT_PREPROC = ( +# train_ratio = 0.8, +# valid_ratio = 1.0, +# shuffle = true, +# stratified = false, +# nfolds = 6, +# rng = TaskLocalRNG() +# ) + +const DEFAULT_FE = ( + features = catch9, +) 
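Reviewer note (not part of the diff): `DEFAULT_FE` wires the default feature set to `catch9`, defined in `src/utils/features_set.jl` below. A minimal sketch of the contract these entries satisfy — each is a plain callable reducing a numeric window to a scalar; the `Statistics` names and toy data here are stand-ins for the full set:

```julia
using Statistics

x   = randn(100)   # toy univariate time series
win = x[1:20]      # one window of it
# every feature function maps a window to a single scalar value
vals = [f(win) for f in (maximum, minimum, mean, std)]
```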
+const DEFAULT_FE_WINPARAMS = (
+    type = adaptivewindow,
+    nwindows = 10,
+    relative_overlap = 0.2
+)
+
+# const AVAIL_WINS = (movingwindow, wholewindow, splitwindow, adaptivewindow)
+# const AVAIL_TREATMENTS = (:aggregate, :reducesize)
+
+const WIN_PARAMS = Dict(
+    movingwindow   => (window_size = 1024, window_step = 512),
+    wholewindow    => NamedTuple(),
+    # NB: the trailing comma is required; without it `(nwindows = 20)` is a
+    # plain assignment evaluating to `20`, not a one-field NamedTuple
+    splitwindow    => (nwindows = 20,),
+    adaptivewindow => (nwindows = 20, relative_overlap = 0.5)
+)
+
+# """
+# Abstract type for dataset configuration outputs
+# """
+# abstract type AbstractDatasetConfig end
+
+# """
+# Abstract type for dataset outputs
+# """
+# abstract type AbstractDataset end
+
+# """
+# Abstract type for dataset train, test and validation indexing
+# """
+# abstract type AbstractIndexCollection end
+
+"""
+Abstract type for feature structs
+"""
+abstract type AbstractFeature end
+
+# """
+#     DatasetInfo{F<:Base.Callable, R<:Real, I<:Integer, RNG<:AbstractRNG} <: AbstractDatasetConfig
+
+# An immutable struct containing dataset configuration and metadata.
+# It is included in ModelConfig and Dataset structs.
+# In a ModelConfig object, it is reachable through the `ds.info` field.
+
+# # Fields
+# - `algo::Symbol`:
+#     Algorithm type, either :classification or :regression.
+# - `treatment::Symbol`:
+#     Data treatment method; specifies how data is reduced when the dataset is composed of time series:
+#     :aggregate, time series are reduced to scalars (propositional case);
+#     :reducesize, time series are windowed to reduce their size.
+# - `features::Vector{F}`:
+#     Feature functions applied to the dataset.
+# - `train_ratio::R`:
+#     Ratio of training data (0-1); specifies the ratio between train and test partitions:
+#     the higher the ratio, the more data is used for training.
+# - `valid_ratio::R`:
+#     Ratio of validation data (0-1); specifies the ratio between train and validation partitions:
+#     the higher the ratio, the more data is used for validation.
+#     If `valid_ratio` is unspecified, no validation data is used.
+# - `shuffle::Bool`:
+#     Whether to shuffle data during train, validation and test partitioning.
+# - `stratified::Bool`:
+#     Whether to use stratified sampling for cross-validation.
+# - `nfolds::I`:
+#     Number of cross-validation folds.
+# - `rng::RNG`:
+#     Random number generator.
+# - `winparams::Union{NamedTuple, Nothing}`:
+#     Window parameters; the NamedTuple should take one of the following forms:
+#     whole window (; type=wholewindow),
+#     adaptive window (type=adaptivewindow, nwindows, relative_overlap),
+#     moving window (type=movingwindow, nwindows, relative_overlap, window_size, window_step),
+#     split window (type=splitwindow, nwindows).
+# - `vnames::Union{Vector{Symbol}, Nothing}`:
+#     Variable names, usually dataset column names.
+# """ +# struct DatasetInfo{F<:Base.Callable, R<:Real, I<:Integer, RNG<:AbstractRNG} <: AbstractDatasetConfig +# algo :: Symbol +# treatment :: Symbol +# features :: Vector{F} +# train_ratio :: R +# valid_ratio :: R +# shuffle :: Bool +# stratified :: Bool +# nfolds :: I +# rng :: RNG +# winparams :: Union{NamedTuple, Nothing} +# vnames :: Union{Vector{Symbol}, Nothing} +# end + +# function DatasetInfo( +# algo::Symbol, +# treatment::Symbol, +# features::AbstractVector{F}, +# train_ratio::R, +# valid_ratio::R, +# shuffle::Bool, +# stratified::Bool, +# nfolds::I, +# rng::RNG, +# winparams::Union{NamedTuple, Nothing}, +# vnames::Union{AbstractVector{<:Union{AbstractString,Symbol}}, Nothing} +# ) where {F<:Base.Callable, R<:Real, I<:Integer, RNG<:AbstractRNG} +# # Validate ratios +# 0 ≤ train_ratio ≤ 1 || throw(ArgumentError("train_ratio must be between 0 and 1")) +# 0 ≤ valid_ratio ≤ 1 || throw(ArgumentError("valid_ratio must be between 0 and 1")) + +# converted_vnames = isnothing(vnames) ? nothing : Vector{Symbol}(Symbol.(vnames)) + +# DatasetInfo{F,R,I,RNG}( +# algo, treatment, features, train_ratio, valid_ratio, +# shuffle, stratified, nfolds, rng, winparams, converted_vnames +# ) +# end + +# function Base.show(io::IO, info::DatasetInfo) +# println(io, "DatasetInfo:") +# for field in fieldnames(DatasetInfo) +# value = getfield(info, field) +# println(io, " ", rpad(String(field) * ":", 15), value) +# end +# end + +# """ +# TT_indexes{T<:Integer} <: AbstractVector{T} + +# A struct that stores indices for train-validation-test splits of a dataset, +# used in Dataset struct. + +# # Fields +# - `train::Vector{T}`: Vector of indices for the training set +# - `valid::Vector{T}`: Vector of indices for the validation set +# - `test::Vector{T}`: Vector of indices for the test set +# """ +# struct TT_indexes{T<:Integer} <: AbstractIndexCollection +# train :: Vector{T} +# valid :: Vector{T} +# test :: Vector{T} +# end + +# function TT_indexes( +# train::AbstractVector{T}, +# valid::AbstractVector{T}, +# test::AbstractVector{T} +# ) where {T<:Integer} +# TT_indexes{T}(train, valid, test) +# end + +# Base.show(io::IO, t::TT_indexes) = print(io, "TT_indexes(train=", t.train, ", validation=", t.valid, ", test=", t.test, ")") +# Base.length(t::TT_indexes) = length(t.train) + length(t.valid) + length(t.test) + +# function _create_views(X, y, tt, stratified::Bool) +# if stratified +# Xtrain = view.(Ref(X), getfield.(tt, :train), Ref(:)) +# Xvalid = view.(Ref(X), getfield.(tt, :valid), Ref(:)) +# Xtest = view.(Ref(X), getfield.(tt, :test), Ref(:)) +# ytrain = view.(Ref(y), getfield.(tt, :train)) +# yvalid = view.(Ref(y), getfield.(tt, :valid)) +# ytest = view.(Ref(y), getfield.(tt, :test)) +# else +# Xtrain = @views X[tt.train, :] +# Xvalid = @views X[tt.valid, :] +# Xtest = @views X[tt.test, :] +# ytrain = @views y[tt.train] +# yvalid = @views y[tt.valid] +# ytest = @views y[tt.test] +# end +# return Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest +# end + +# """ +# Dataset{T<:AbstractDataFrame,S} <: AbstractDataset + +# An immutable struct that efficiently stores dataset splits for machine learning. 
+ +# # Fields +# - `X::T`: The feature matrix as a DataFrame +# - `y::S`: The target vector +# - `tt::Union{TT_indexes{I}, Vector{TT_indexes{I}}}`: Train-test split indices +# - `info::DatasetInfo`: Dataset metadata and configuration +# - `Xtrain`, `Xvalid`, `Xtest`: Data views for features +# - `ytrain`, `yvalid`, `ytest`: Data views for targets +# """ +# struct Dataset{T<:AbstractDataFrame,S} <: AbstractDataset +# X :: T +# y :: S +# tt :: Union{TT_indexes, AbstractVector{<:TT_indexes}} +# info :: DatasetInfo +# Xtrain :: Union{SubDataFrame{T}, Vector{<:SubDataFrame{T}}} +# Xvalid :: Union{SubDataFrame{T}, Vector{<:SubDataFrame{T}}} +# Xtest :: Union{SubDataFrame{T}, Vector{<:SubDataFrame{T}}} +# ytrain :: Union{SubArray{<:eltype(S)}, Vector{<:SubArray{<:eltype(S)}}} +# yvalid :: Union{SubArray{<:eltype(S)}, Vector{<:SubArray{<:eltype(S)}}} +# ytest :: Union{SubArray{<:eltype(S)}, Vector{<:SubArray{<:eltype(S)}}} + +# function Dataset(X::T, y::S, tt, info) where {T<:AbstractDataFrame,S} +# if info.stratified +# Xtrain = view.(Ref(X), getfield.(tt, :train), Ref(:)) +# Xvalid = view.(Ref(X), getfield.(tt, :valid), Ref(:)) +# Xtest = view.(Ref(X), getfield.(tt, :test), Ref(:)) +# ytrain = view.(Ref(y), getfield.(tt, :train)) +# yvalid = view.(Ref(y), getfield.(tt, :valid)) +# ytest = view.(Ref(y), getfield.(tt, :test)) +# else +# Xtrain = @views X[tt.train, :] +# Xvalid = @views X[tt.valid, :] +# Xtest = @views X[tt.test, :] +# ytrain = @views y[tt.train] +# yvalid = @views y[tt.valid] +# ytest = @views y[tt.test] +# end + +# new{T,S}(X, y, tt, info, Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest) +# end +# end + +# function Base.show(io::IO, ds::Dataset) +# println(io, "Dataset:") +# println(io, " X shape: ", size(ds.X)) +# println(io, " y length: ", length(ds.y)) +# if ds.tt isa AbstractVector +# println(io, " Train/Valid/Test: ", length(ds.tt), " folds") +# else +# println(io, " Train indices: ", length(ds.tt.train)) +# println(io, " Valid indices: ", length(ds.tt.valid)) +# println(io, " Test indices: ", length(ds.tt.test)) +# end +# print(io, ds.info) +# end + +""" + Feature{V<:Number, T<:Union{Symbol, String}} <: AbstractFeature + +A parametric struct that represents a feature extracted from time series data. 
+
+# Type Parameters
+- `V`: Type of the feature value (must be a subtype of `Number`)
+- `T`: Type of the variable name (must be either `Symbol` or `String`)
+
+# Fields
+- `value::V`: The numerical value of the feature
+- `var::T`: The variable name/identifier
+- `feats::Symbol`: The feature extraction function name
+- `nwin::Int`: The window number (must be positive)
+
+# Constructors
+```julia
+Feature(value::Number, var::Union{Symbol,String}, feats::Symbol, nwin::Integer)
+```
+"""
+struct Feature{V<:Number, T<:Union{Symbol, String}} <: AbstractFeature
+    value :: V
+    var   :: T
+    feats :: Symbol
+    nwin  :: Int
+
+    function Feature(value::Number, var::Union{Symbol,String}, feats::Symbol, nwin::Integer)
+        nwin > 0 || throw(ArgumentError("Window number must be positive"))
+        new{typeof(value), typeof(var)}(value, var, feats, nwin)
+    end
+end
+
+# Pretty printing
+Base.show(io::IO, f::Feature) = print(io,
+    "Feature($(f.value), $(f.var), $(f.feats), window=$(f.nwin))")
+
+# Value access methods
+Base.getproperty(f::Feature, s::Symbol) = getfield(f, s)
+Base.propertynames(::Feature) = (:value, :var, :feats, :nwin)
+
+# Conversion methods for NaN handling
+Base.convert(::Type{Feature}, x::Missing) = Feature(NaN, :missing, :none, 1)
+Base.convert(::Type{Feature}, x::Nothing) = Feature(NaN, :nothing, :none, 1)
+
+# Test if value is NaN
+Base.isnan(f::Feature) = isnan(f.value)
+
+# Numeric comparisons
+Base.isless(f::Feature, x::Number) = isless(f.value, x)
+Base.isless(x::Number, f::Feature) = isless(x, f.value)
+Base.isless(f1::Feature, f2::Feature) = isless(f1.value, f2.value)
+
+# Convert to number for arithmetic operations
+Base.convert(::Type{Number}, f::Feature) = f.value
+Base.convert(::Type{Float64}, f::Feature) = convert(Float64, f.value)
+
+# Forward numeric operations to the value field
+for op in (:+, :-, :*, :/, :^)
+    @eval Base.$op(f::Feature, x::Number) = $op(f.value, x)
+    @eval Base.$op(x::Number, f::Feature) = $op(x, f.value)
+end
+
+# Get variable name
+variable_name(f::Feature) = f.var
+# Get feature type
+feature_type(f::Feature) = f.feats
+# Get window number
+window_number(f::Feature) = f.nwin
diff --git a/src/dataset/prepare_dataset.jl b/src/dataset/prepare_dataset.jl
new file mode 100644
index 0000000..55ebc5b
--- /dev/null
+++ b/src/dataset/prepare_dataset.jl
@@ -0,0 +1,427 @@
+# # ---------------------------------------------------------------------------- #
+# #                                    utils                                      #
+# # ---------------------------------------------------------------------------- #
+# check_dataframe_type(df::AbstractDataFrame) = all(col -> eltype(col) <: Union{Real,AbstractArray{<:Real}}, eachcol(df))
+# hasnans(X::AbstractDataFrame) = any(x -> x == 1, SoleData.hasnans.(eachcol(X)))
+
+# ---------------------------------------------------------------------------- #
+#                               check dimensions                                #
+# ---------------------------------------------------------------------------- #
+"""
+    _check_dimensions(X::DataFrame) -> Int
+
+Internal function.
+Check the dimensionality of elements in DataFrame columns.
+Currently supports only scalar values and time series (1-dimensional arrays).
+
+# Returns
+- `Int`: 0 for scalar elements, 1 for 1D array elements
+
+# Throws
+- `DimensionMismatch`: If elements have inconsistent dimensions
+- `ArgumentError`: If elements have more than one dimension
+"""
+function _check_dimensions(X::DataFrame)
+    isempty(X) && return 0
+
+    # Get reference dimensions from first element
+    first_col = first(eachcol(X))
+    ref_dims = ndims(first(first_col))
+
+    # Early dimension check
+    ref_dims > 1 && throw(ArgumentError("Elements with more than 1 dimension are not supported."))
+
+    # Check all columns maintain same dimensionality
+    all(col -> all(x -> ndims(x) == ref_dims, col), eachcol(X)) ||
+        throw(DimensionMismatch("Inconsistent dimensions across elements"))
+
+    return ref_dims
+end
+
+# ---------------------------------------------------------------------------- #
+#                                  treatment                                   #
+# ---------------------------------------------------------------------------- #
+"""
+    _treatment(X::DataFrame, vnames::AbstractVector{String}, treatment::Symbol,
+        features::AbstractVector{<:Base.Callable}, winparams::NamedTuple)
+
+Internal function.
+Processes the input DataFrame `X` based on the specified `treatment` type,
+either aggregating the data, reducing its size, or building `Feature` objects
+for feature selection. The function applies the given `features` to the
+columns specified by `vnames`, using window parameters defined in `winparams`.
+
+# Arguments
+- `X::DataFrame`: The input data to be processed.
+- `vnames::AbstractVector{String}`: Names of the columns in `X` to be treated.
+- `treatment::Symbol`: The type of treatment to apply: `:aggregate`,
+    `:reducesize`, or `:feature_selection`.
+- `features::AbstractVector{<:Base.Callable}`: Functions to apply to the
+    specified columns.
+- `winparams::NamedTuple`: Parameters defining the windowing strategy,
+    including the type of window function.
+
+# Returns
+- `DataFrame`: A new DataFrame with the processed data.
+
+# Throws
+- `ArgumentError`: If `winparams` does not contain a valid `type`.
+"""
+function _treatment(
+    X::DataFrame,
+    vnames::AbstractVector{String},
+    treatment::Symbol,
+    features::AbstractVector{<:Base.Callable},
+    winparams::NamedTuple
+)
+    # check parameters
+    haskey(winparams, :type) || throw(ArgumentError("winparams must contain a `type` key, one of: $(keys(WIN_PARAMS))"))
+    haskey(WIN_PARAMS, winparams.type) || throw(ArgumentError("winparams.type must be one of: $(keys(WIN_PARAMS))"))
+
+    # the longest series in the dataset drives the reference windowing
+    # (consistent with the per-row computation in the fill loop below)
+    max_interval = maximum(maximum(length.(collect(row))) for row in eachrow(X))
+    _wparams = NamedTuple(k => v for (k,v) in pairs(winparams) if k != :type)
+    n_intervals = winparams.type(max_interval; _wparams...)
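+    # annotation (not in the original code): `winparams.type` is one of the
+    # window constructors referenced in WIN_PARAMS (movingwindow, wholewindow,
+    # splitwindow, adaptivewindow — presumably provided by SoleBase). Called
+    # with a series length and the remaining window parameters, it returns a
+    # vector of index ranges, one per window, so `length(n_intervals)` is the
+    # number of windows and `col[r]` below slices one window out of a series.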
+
+    # Initialize DataFrame
+    if treatment == :aggregate          # propositional
+        if length(n_intervals) == 1
+            valid_X = DataFrame([v => Float64[]
+                for v in [string(f, "(", v, ")")
+                    for f in features for v in vnames]]
+            )
+        else
+            valid_X = DataFrame([v => Float64[]
+                for v in [string(f, "(", v, ")w", i)
+                    for f in features for v in vnames
+                        for i in 1:length(n_intervals)]]
+            )
+        end
+
+    elseif treatment == :reducesize     # modal
+        valid_X = DataFrame([name => Vector{Float64}[] for name in vnames])
+
+    elseif treatment == :feature_selection
+        if length(n_intervals) == 1
+            valid_X = DataFrame([v => Feature[]
+                for v in [string(f, "(", v, ")")
+                    for f in features for v in vnames]]
+            )
+        else
+            valid_X = DataFrame([v => Feature[]
+                for v in [string(f, "(", v, ")w", i)
+                    for f in features for v in vnames
+                        for i in 1:length(n_intervals)]]
+            )
+        end
+
+    else
+        throw(ArgumentError("Unknown treatment: $treatment"))
+    end
+
+    # Fill DataFrame
+    for row in eachrow(X)
+        row_intervals = winparams.type(maximum(length.(collect(row))); _wparams...)
+        # interval_diff is used in case we encounter a row with fewer intervals than the maximum
+        interval_diff = length(n_intervals) - length(row_intervals)
+
+        if treatment == :aggregate
+            push!(valid_X, vcat([
+                vcat([f(col[r]) for r in row_intervals],
+                    # if interval_diff is positive, fill the rest with NaN
+                    fill(NaN, interval_diff)) for col in row, f in features
+            ]...)
+            )
+        elseif treatment == :reducesize
+            f = haskey(_wparams, :reducefunc) ? _wparams.reducefunc : mean
+            push!(valid_X, [
+                vcat([f(col[r]) for r in row_intervals],
+                    # if interval_diff is positive, fill the rest with NaN
+                    fill(NaN, interval_diff)) for col in row
+            ]
+            )
+        elseif treatment == :feature_selection
+            push!(valid_X, vcat([
+                vcat([
+                    Feature(f(col[r]), vnames[i], Symbol(f), w) for (w, r) in enumerate(row_intervals)],
+                    # pad ragged rows with NaN-valued Features, so the column eltype stays Feature
+                    [Feature(NaN, vnames[i], Symbol(f), length(row_intervals) + k) for k in 1:interval_diff]) for (i, col) in enumerate(row), f in features
+            ]...)
+            )
+        end
+    end
+
+    return valid_X
+end
+
+# # ---------------------------------------------------------------------------- #
+# #                                 partitioning                                  #
+# # ---------------------------------------------------------------------------- #
+# """
+#     _partition(y::Union{CategoricalArray, Vector{T}}, train_ratio::Float64,
+#         shuffle::Bool, stratified::Bool, nfolds::Int, rng::AbstractRNG)
+#         where {T<:Union{AbstractString, Number}}
+
+# Partitions the input vector `y` into training and testing indices based on
+# the specified parameters. Supports both stratified and non-stratified
+# partitioning.
+
+# # Arguments
+# - `y::Union{CategoricalArray, Vector{T}}`: The target variable to partition.
+# - `train_ratio::Float64`: The ratio of data to be used for training in
+#     non-stratified partitioning.
+# - `shuffle::Bool`: Whether to shuffle the data before partitioning.
+# - `stratified::Bool`: Whether to perform stratified partitioning.
+# - `nfolds::Int`: Number of folds for cross-validation in stratified
+#     partitioning.
+# - `rng::AbstractRNG`: Random number generator for reproducibility.

+# # Returns
+# - `Vector{Tuple{Vector{Int}, Vector{Int}}}`: A vector of tuples containing
+#     training and testing indices.

+# # Throws
+# - `ArgumentError`: If `nfolds` is less than 2 when `stratified` is true.
+# """ + +# function _partition( +# y::Union{CategoricalArray,Vector{T}}, +# # validation::Bool, +# train_ratio::Float64, +# valid_ratio::Float64, +# shuffle::Bool, +# stratified::Bool, +# nfolds::Int, +# rng::AbstractRNG +# ) where {T<:Union{AbstractString,Number}} +# if stratified +# stratified_cv = MLJ.StratifiedCV(; nfolds, shuffle, rng) +# tt = MLJ.MLJBase.train_test_pairs(stratified_cv, 1:length(y), y) +# if valid_ratio == 1.0 +# return [TT_indexes(train, eltype(train)[], test) for (train, test) in tt] +# else +# tv = collect((MLJ.partition(t[1], train_ratio)..., t[2]) for t in tt) +# return [TT_indexes(train, valid, test) for (train, valid, test) in tv] +# end +# else +# tt = MLJ.partition(eachindex(y), train_ratio; shuffle, rng) +# if valid_ratio == 1.0 +# return TT_indexes(tt[1], eltype(tt[1])[], tt[2]) +# else +# tv = MLJ.partition(tt[1], valid_ratio; shuffle, rng) +# return TT_indexes(tv[1], tv[2], tt[2]) +# end +# end +# end + +# # ---------------------------------------------------------------------------- # +# # prepare dataset # +# # ---------------------------------------------------------------------------- # +# """ +# prepare_dataset(X::AbstractDataFrame, y::AbstractVector; algo::Symbol=:classification, +# treatment::Symbol=:aggregate, features::AbstractVector{<:Base.Callable}=DEFAULT_FEATS, +# train_ratio::Float64=0.8, shuffle::Bool=true, stratified::Bool=false, +# nfolds::Int=6, rng::AbstractRNG=Random.TaskLocalRNG(), +# winparams::Union{NamedTuple,Nothing}=nothing, +# vnames::Union{AbstractVector{<:Union{AbstractString,Symbol}},Nothing}=nothing) + +# Prepares a dataset for machine learning by processing the input DataFrame `X` and target vector `y`. +# Supports both classification and regression tasks, with options for data treatment and partitioning. + +# # Arguments +# - `X::AbstractDataFrame`: The input data containing features. +# - `y::AbstractVector`: The target variable corresponding to the rows in `X`. +# - `algo::Symbol`: The type of algorithm, either `:classification` or `:regression`. +# - `treatment::Symbol`: The data treatment method, default is `:aggregate`. +# - `features::AbstractVector{<:Base.Callable}`: Functions to apply to the data columns. +# - `train_ratio::Float64`: Ratio of data to be used for training. +# - `shuffle::Bool`: Whether to shuffle data before partitioning. +# - `stratified::Bool`: Whether to use stratified partitioning. +# - `nfolds::Int`: Number of folds for cross-validation. +# - `rng::AbstractRNG`: Random number generator for reproducibility. +# - `winparams::Union{NamedTuple,Nothing}`: Parameters for windowing strategy. +# - `vnames::Union{AbstractVector{<:Union{AbstractString,Symbol}},Nothing}`: Names of the columns in `X`. + +# # Returns +# - `SoleXplorer.Dataset`: A dataset object containing processed data and partitioning information. + +# # Throws +# - `ArgumentError`: If input parameters are invalid or unsupported column types are encountered. 
+# """ + +# function prepare_dataset( +# X::AbstractDataFrame, +# y::AbstractVector; +# # model.config +# algo::Symbol=:classification, +# treatment::Symbol=:aggregate, +# features::AbstractVector{<:Base.Callable}=DEFAULT_FEATS, +# # validation::Bool=false, +# # model.preprocess +# train_ratio::Float64=0.8, +# valid_ratio::Float64=1.0, +# shuffle::Bool=true, +# stratified::Bool=false, +# nfolds::Int=6, +# rng::AbstractRNG=Random.TaskLocalRNG(), +# # model.winparams +# winparams::Union{NamedTuple,Nothing}=nothing, +# vnames::Union{AbstractVector{<:Union{AbstractString,Symbol}},Nothing}=nothing, +# ) +# # check parameters +# check_dataframe_type(X) || throw(ArgumentError("DataFrame must contain only numeric values")) +# size(X, 1) == length(y) || throw(ArgumentError("Number of rows in DataFrame must match length of class labels")) +# treatment in AVAIL_TREATMENTS || throw(ArgumentError("Treatment must be one of: $AVAIL_TREATMENTS")) + +# if algo == :regression +# y isa AbstractVector{<:Number} || throw(ArgumentError("Regression requires a numeric target variable")) +# y isa AbstractFloat || (y = Float64.(y)) +# elseif algo == :classification +# y isa AbstractVector{<:AbstractFloat} && throw(ArgumentError("Classification requires a categorical target variable")) +# y isa CategoricalArray || (y = coerce(y, MLJ.Multiclass)) +# else +# throw(ArgumentError("Algorithms supported, :regression and :classification")) +# end + +# if isnothing(vnames) +# vnames = names(X) +# else +# size(X, 2) == length(vnames) || throw(ArgumentError("Number of columns in DataFrame must match length of variable names")) +# vnames = eltype(vnames) <: Symbol ? string.(vnames) : vnames +# end + +# hasnans(X) && @warn "DataFrame contains NaN values" + +# column_eltypes = eltype.(eachcol(X)) + +# ds_info = DatasetInfo( +# algo, +# treatment, +# features, +# train_ratio, +# valid_ratio, +# shuffle, +# stratified, +# nfolds, +# rng, +# winparams, +# vnames, +# # validation +# ) + +# # case 1: dataframe with numeric columns +# if all(t -> t <: Number, column_eltypes) +# return SoleXplorer.Dataset( +# DataFrame(vnames .=> eachcol(X)), y, +# # _partition(y, validation, train_ratio, valid_ratio, shuffle, stratified, nfolds, rng), +# _partition(y, train_ratio, valid_ratio, shuffle, stratified, nfolds, rng), +# ds_info +# ) +# # case 2: dataframe with vector-valued columns +# elseif all(t -> t <: AbstractVector{<:Number}, column_eltypes) +# return SoleXplorer.Dataset( +# # if winparams is nothing, then leave the dataframe as it is +# isnothing(winparams) ? DataFrame(vnames .=> eachcol(X)) : _treatment(X, vnames, treatment, features, winparams), y, +# # _partition(y, validation, train_ratio, valid_ratio, shuffle, stratified, nfolds, rng), +# _partition(y, train_ratio, valid_ratio, shuffle, stratified, nfolds, rng), +# ds_info +# ) +# else +# throw(ArgumentError("Column type not yet supported")) +# end +# end + +# function prepare_dataset( +# X::AbstractDataFrame, +# y::AbstractVector, +# model::AbstractModelSet +# ) +# # check if it's needed also validation set +# # validation = haskey(VALIDATION, model.type) && getproperty(model.params, VALIDATION[model.type][1]) != VALIDATION[model.type][2] +# # valid_ratio = (validation && model.preprocess.valid_ratio == 1) ? 
0.8 : model.preprocess.valid_ratio

+#     prepare_dataset(
+#         X, y;
+#         algo=model.config.algo,
+#         treatment=model.config.treatment,
+#         features=model.features,
+#         # validation,
+#         # model.preprocess
+#         train_ratio=model.preprocess.train_ratio,
+#         valid_ratio=model.preprocess.valid_ratio,
+#         shuffle=model.preprocess.shuffle,
+#         stratified=model.preprocess.stratified,
+#         nfolds=model.preprocess.nfolds,
+#         rng=model.preprocess.rng,
+#         winparams=model.winparams,
+#     )
+# end

+# # y is not a vector, but a symbol or a string that identifies the column in X
+# function prepare_dataset(
+#     X::AbstractDataFrame,
+#     y::Union{Symbol,AbstractString},
+#     args...; kwargs...
+# )
+#     prepare_dataset(X[!, Not(y)], X[!, y], args...; kwargs...)
+# end

+# ---------------------------------------------------------------------------- #
+#                         feature selection preprocess                          #
+# ---------------------------------------------------------------------------- #
+"""
+    feature_selection_preprocess(
+        X::DataFrame;
+        vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing,
+        features::Union{Vector{<:Base.Callable}, Nothing}=nothing,
+        nwindows::Union{Int, Nothing}=nothing
+    ) -> DataFrame
+
+Process a DataFrame for feature selection by converting its columns into Feature objects.
+
+# Arguments
+- `X::DataFrame`: Input DataFrame containing time series data
+- `vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing`: Names for the variables.
+    If nothing, uses DataFrame column names
+- `features::Union{Vector{<:Base.Callable}, Nothing}=nothing`: Feature extraction functions.
+    If nothing, uses DEFAULT_FE.features
+- `nwindows::Union{Int, Nothing}=nothing`: Number of windows for time series segmentation.
+    If nothing, uses DEFAULT_FE_WINPARAMS
+
+# Returns
+- `DataFrame`: A DataFrame where each element is a Feature object containing:
+    - value: extracted feature value
+    - var: variable name
+    - feats: feature extraction function used
+    - nwin: window number
+
+# Example
+```julia
+# Basic usage with default parameters
+df = DataFrame(a = [rand(10) for _ in 1:5])
+result = feature_selection_preprocess(df)
+
+# Custom features and windows
+df = DataFrame(a = [rand(10) for _ in 1:5])
+result = feature_selection_preprocess(df,
+    features = [mean, std],
+    nwindows = 3
+)
+```
+"""
+function feature_selection_preprocess(
+    X::DataFrame;
+    vnames::Union{Vector{String}, Vector{Symbol}, Nothing}=nothing,
+    features::Union{Vector{<:Base.Callable}, Nothing}=nothing,
+    nwindows::Union{Int, Nothing}=nothing
+)
+    # check parameters; normalize vnames to String, as required by _treatment
+    vnames = isnothing(vnames) ? names(X) : string.(vnames)
+    isnothing(features) && (features = DEFAULT_FE.features)
+    treatment = :feature_selection
+    _ = _check_dimensions(X)
+    if !isnothing(nwindows)
+        nwindows > 0 || throw(ArgumentError("Number of windows must be positive."))
+    end
+    winparams = isnothing(nwindows) ? DEFAULT_FE_WINPARAMS :
+        merge(DEFAULT_FE_WINPARAMS, (nwindows = nwindows,))
+
+    _treatment(X, vnames, treatment, features, winparams)
+end
diff --git a/src/utils/features_set.jl b/src/utils/features_set.jl
new file mode 100644
index 0000000..13aa897
--- /dev/null
+++ b/src/utils/features_set.jl
@@ -0,0 +1,39 @@
+# ---------------------------------------------------------------------------- #
+#                        catch22 pretty named functions                         #
+# ---------------------------------------------------------------------------- #
+
+mode_5(x) = Catch22.DN_HistogramMode_5(x); @doc (@doc Catch22.DN_HistogramMode_5) mode_5
+mode_10(x) = Catch22.DN_HistogramMode_10(x); @doc (@doc Catch22.DN_HistogramMode_10) mode_10
+embedding_dist(x) = Catch22.CO_Embed2_Dist_tau_d_expfit_meandiff(x); @doc (@doc Catch22.CO_Embed2_Dist_tau_d_expfit_meandiff) embedding_dist
+acf_timescale(x) = Catch22.CO_f1ecac(x); @doc (@doc Catch22.CO_f1ecac) acf_timescale
+acf_first_min(x) = Catch22.CO_FirstMin_ac(x); @doc (@doc Catch22.CO_FirstMin_ac) acf_first_min
+ami2(x) = Catch22.CO_HistogramAMI_even_2_5(x); @doc (@doc Catch22.CO_HistogramAMI_even_2_5) ami2
+trev(x) = Catch22.CO_trev_1_num(x); @doc (@doc Catch22.CO_trev_1_num) trev
+outlier_timing_pos(x) = Catch22.DN_OutlierInclude_p_001_mdrmd(x); @doc (@doc Catch22.DN_OutlierInclude_p_001_mdrmd) outlier_timing_pos
+outlier_timing_neg(x) = Catch22.DN_OutlierInclude_n_001_mdrmd(x); @doc (@doc Catch22.DN_OutlierInclude_n_001_mdrmd) outlier_timing_neg
+whiten_timescale(x) = Catch22.FC_LocalSimple_mean1_tauresrat(x); @doc (@doc Catch22.FC_LocalSimple_mean1_tauresrat) whiten_timescale
+forecast_error(x) = Catch22.FC_LocalSimple_mean3_stderr(x); @doc (@doc Catch22.FC_LocalSimple_mean3_stderr) forecast_error
+ami_timescale(x) = Catch22.IN_AutoMutualInfoStats_40_gaussian_fmmi(x); @doc (@doc Catch22.IN_AutoMutualInfoStats_40_gaussian_fmmi) ami_timescale
+high_fluctuation(x) = Catch22.MD_hrv_classic_pnn40(x); @doc (@doc Catch22.MD_hrv_classic_pnn40) high_fluctuation
+stretch_decreasing(x) = Catch22.SB_BinaryStats_diff_longstretch0(x); @doc (@doc Catch22.SB_BinaryStats_diff_longstretch0) stretch_decreasing
+stretch_high(x) = Catch22.SB_BinaryStats_mean_longstretch1(x); @doc (@doc Catch22.SB_BinaryStats_mean_longstretch1) stretch_high
+entropy_pairs(x) = Catch22.SB_MotifThree_quantile_hh(x); @doc (@doc Catch22.SB_MotifThree_quantile_hh) entropy_pairs
+rs_range(x) = Catch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1(x); @doc (@doc Catch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1) rs_range
+dfa(x) = Catch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1(x); @doc (@doc Catch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1) dfa
+low_freq_power(x) = Catch22.SP_Summaries_welch_rect_area_5_1(x); @doc (@doc Catch22.SP_Summaries_welch_rect_area_5_1) low_freq_power
+centroid_freq(x) = Catch22.SP_Summaries_welch_rect_centroid(x); @doc (@doc Catch22.SP_Summaries_welch_rect_centroid) centroid_freq
+transition_variance(x) = Catch22.SB_TransitionMatrix_3ac_sumdiagcov(x); @doc (@doc Catch22.SB_TransitionMatrix_3ac_sumdiagcov) transition_variance
+periodicity(x) = Catch22.PD_PeriodicityWang_th0_01(x); @doc (@doc Catch22.PD_PeriodicityWang_th0_01) periodicity
+
+# ---------------------------------------------------------------------------- #
+#                                    catch9                                     #
+# ---------------------------------------------------------------------------- #
+base_set = [maximum, minimum, mean, std]
+catch9 = [maximum, minimum, mean, median, std, stretch_high, stretch_decreasing, entropy_pairs, transition_variance]
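Reviewer note (not part of the diff): the renamed wrappers above behave like ordinary feature functions, so the sets defined here and just below compose with plain iteration. A hypothetical sketch, assuming the package is loaded:

```julia
using SoleFeatures

x = randn(200)                   # toy series
entropy_pairs(x)                 # Catch22.SB_MotifThree_quantile_hh under the hood
scores = [f(x) for f in catch9]  # evaluate the whole catch9 set on one series
```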
+catch22_set = [mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos, + outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing, + stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity] +complete_set = [maximum, minimum, mean, median, std, StatsBase.cov, + mode_5, mode_10, embedding_dist, acf_timescale, acf_first_min, ami2, trev, outlier_timing_pos, + outlier_timing_neg, whiten_timescale, forecast_error, ami_timescale, high_fluctuation, stretch_decreasing, + stretch_high, entropy_pairs, rs_range, dfa, low_freq_power, centroid_freq, transition_variance, periodicity] \ No newline at end of file diff --git a/test/modules/prepare_dataset.jl b/test/modules/prepare_dataset.jl new file mode 100644 index 0000000..b0d6ff4 --- /dev/null +++ b/test/modules/prepare_dataset.jl @@ -0,0 +1,93 @@ +using SoleFeatures +using Test +using Sole +using Random, StatsBase, DataFrames +using MLJTuning + +# ---------------------------------------------------------------------------- # +# DATASET PREPARATION # +# ---------------------------------------------------------------------------- # +X, y = SoleData.load_arff_dataset("NATOPS") +train_seed = 11 +rng = Random.Xoshiro(train_seed) +Random.seed!(train_seed) + +# downsize dataset +num_cols_to_sample = 10 +num_rows_to_sample = 50 +chosen_cols = StatsBase.sample(rng, 1:size(X, 2), num_cols_to_sample; replace=false) +chosen_rows = StatsBase.sample(rng, 1:size(X, 1), num_rows_to_sample; replace=false) + +X = X[chosen_rows, chosen_cols] +y = y[chosen_rows] + +@testset "feature_selection_preprocess" begin + @testset "Basic functionality" begin + # Test default parameters + result = feature_selection_preprocess(X) + @test result isa DataFrame + @test all(col -> eltype(col) <: SoleFeatures.Feature, eachcol(result)) + @test size(result, 1) == size(X, 1) + + # Test first Feature object properties + first_feature = first(result[!, 1]) + @test first_feature isa SoleFeatures.Feature + @test first_feature.var isa String + @test first_feature.feats isa Symbol + @test first_feature.nwin isa Int + @test first_feature.nwin > 0 + end + + @testset "Custom parameters" begin + X2 = DataFrame( + temp = [rand(10) for _ in 1:5], + press = [rand(10) for _ in 1:5] + ) + + # Custom features and window + custom_features = [mean, std] + result = feature_selection_preprocess(X2, + features = custom_features, + nwindows = 3, + vnames = ["temperature", "pressure"] + ) + + # Check dimensions + expected_cols = length(custom_features) * size(X2, 2) * 3 # features * variables * windows + @test size(result, 2) == expected_cols + + # Check feature names + for (f, v, w) in Iterators.product(custom_features, ["temperature", "pressure"], 1:3) + col_name = "$(f)($(v))w$(w)" + @test col_name in names(result) + end + end + + @testset "Error handling" begin + # Test with empty DataFrame + @test_throws ArgumentError feature_selection_preprocess(DataFrame()) + + # Test with mixed dimensions + X_invalid = DataFrame( + a = [1.0, 2.0], + b = [[1.0, 2.0], [3.0, 4.0]] + ) + @test_throws DimensionMismatch feature_selection_preprocess(X_invalid) + + # Test with invalid windows + X = DataFrame(a = [rand(10) for _ in 1:5]) + @test_throws ArgumentError feature_selection_preprocess(X, nwindows = 0) + @test_throws ArgumentError feature_selection_preprocess(X, nwindows = -1) + end + + @testset "Performance" begin + # Create larger dataset + X = DataFrame( + [Symbol("var$i") => 
[rand(100) for _ in 1:100] for i in 1:5] + ) + + # Measure execution time + time_taken = @elapsed feature_selection_preprocess(X) + @test time_taken < 5.0 # Should complete within 5 seconds + end +end diff --git a/test/runtests.jl b/test/runtests.jl index e5ec4ce..24cfe6b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,223 +1,256 @@ -using HypothesisTests -using StatsBase -using Test -using Revise -using MultiData using SoleFeatures +using Test +using Random -include("./test_function.jl") - -@testset "SoleFeatures.jl" begin - - @testset "transform" begin - - @testset "transform!(md, bm; i_modality) using bitmask on a modality of MultiDataset" begin - df = random_timeseries_df(; nvar=10) - md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) - bm_mod = BitVector([0,1,0]) - idx_mod = 1 - # expected values - emfd = deepcopy(md) - MultiData.dropvariables!(emfd, [3,8]) - - SoleFeatures.transform!(md, bm_mod; i_modality=idx_mod) - - @test isequal(md, emfd) - end - - @testset "transform!(md, bm) using bitmask on whole MultiDataset" begin - df = random_timeseries_df(; nvar=5) - md = MultiData.MultiDataset(df, [[4,2,1], [5,3]]) - bm_mod = BitVector([0,1,0,1,1]) - # expected values - emfd = deepcopy(md) - MultiData.dropvariables!(emfd, [1,3]) - - SoleFeatures.transform!(md, bm_mod) - - @test isequal(md, emfd) - end - - @testset "transform!(df, bm) using bitmask on DataFrame" begin - df = random_timeseries_df(; nvar=5) - bm = BitVector([0,1,0,1,1]) - # expected values - edf = deepcopy(df) - select!(edf, [2,4,5]) - - SoleFeatures.transform!(df, bm) - - @test isequal(df, edf) - end - - @testset "transform(md, bm; i_modality) using bitmask on a modality of MultiDataset" begin - df = random_timeseries_df(; nvar=10) - md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) - bm_mod = BitVector([0,1,0]) - idx_mod = 1 - # expected values - emfd = deepcopy(md) - MultiData.dropvariables!(emfd, [3,8]) - - md = SoleFeatures.transform(md, bm_mod; i_modality=idx_mod) - - @test isequal(md, emfd) - end - - @testset "transform(md, bm) using bitmask on whole MultiDataset" begin - df = random_timeseries_df(; nvar=5) - md = MultiData.MultiDataset(df, [[4,2,1], [5,3]]) - bm_mod = BitVector([0,1,0,1,1]) - # expected values - emfd = deepcopy(md) - MultiData.dropvariables!(emfd, [1,3]) - - md = SoleFeatures.transform(md, bm_mod) - - @test isequal(md, emfd) - end - - @testset "transform(df, bm) using bitmask on DataFrame" begin - df = random_timeseries_df(; nvar=5) - bm = BitVector([0,1,0,1,1]) - # expected values - edf = deepcopy(df) - select!(edf, [2,4,5]) - - df = SoleFeatures.transform(df, bm) - - @test isequal(df, edf) - end - +function run_tests(list) + println("\n" * ("#"^50)) + for test in list + println("TEST: $test") + include(test) end +end - @testset "utils" begin - - @testset "_mod_bm2mfd_bm using array of frames and array of bitmasks" begin - df = random_timeseries_df(; nvar=10) - md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) - frms = [1,2,3] - bms = Vector{BitVector}([ [0,1,0],[0,0,1],[0,1,1,0] ]) - # expected values - ebm = BitVector([ 0,0,0,1,0,1,1,0,1,0 ]) - - resbm = SoleFeatures._mod_bm2mfd_bm(md, frms, bms) - - @test isequal(resbm, ebm) - end +println("Julia version: ", VERSION) - @testset "_mod_bm2mfd_bm using modality and bitmask" begin - df = random_timeseries_df(; nvar=10) - md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) - frm = 2 - bm = BitVector([0,0,1]) - # expected values - ebm = BitVector([ 0,0,1,1,1,1,1,1,1,1 ]) +test_suites = [ 
+ ("Prepare Dataset", ["modules/prepare_dataset.jl", ]), - resbm = SoleFeatures._mod_bm2mfd_bm(md, frm, bm) +] - @test isequal(resbm, ebm) +@testset "SoleFeatures.jl" begin + for ts in eachindex(test_suites) + name = test_suites[ts][1] + list = test_suites[ts][2] + let + @testset "$name" begin + run_tests(list) + end end - end + println() +end - @testset "selectors" begin - @testset "transform" begin +# using HypothesisTests +# using StatsBase +# using Test +# using Revise +# using MultiData +# using SoleFeatures - @testset "RandomRanking" begin - seed = 1997 - rr = RandomRanking(3, seed) - df = random_timeseries_df(;nvar=10) - # expected values - edf = deepcopy(df) - select!(edf, [6,2,5]) +# include("./test_function.jl") - SoleFeatures.transform!(df, rr) +# @testset "SoleFeatures.jl" begin - @test isequal(df, edf) - end +# @testset "transform" begin - @testset "VarianceThreshold" begin - df = random_df() - ndf = SoleFeatures.minmax_normalize(df; min_quantile=0.0, max_quantile=1.0) - vt = VarianceThreshold(0.09) - @test (SoleFeatures.transform!(df, vt) isa DataFrame) - end +# @testset "transform!(md, bm; i_modality) using bitmask on a modality of MultiDataset" begin +# df = random_timeseries_df(; nvar=10) +# md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) +# bm_mod = BitVector([0,1,0]) +# idx_mod = 1 +# # expected values +# emfd = deepcopy(md) +# MultiData.dropvariables!(emfd, [3,8]) - @testset "VarianceRanking" begin - df = random_df() - ndf = SoleFeatures.minmax_normalize(df; min_quantile=0.0, max_quantile=1.0) - vr = VarianceRanking(3) - @test (SoleFeatures.transform!(df, vr) isa DataFrame) - end +# SoleFeatures.transform!(md, bm_mod; i_modality=idx_mod) - @testset "StatisticalMajority" begin - df = random_df() - y = rand([:a, :b, :c], 100) - sm = StatisticalMajority(UnequalVarianceTTest) - @test (SoleFeatures.transform!(df, y, sm) isa DataFrame) - end +# @test isequal(md, emfd) +# end - @testset "StatisticalAtLeastOnce" begin - df = random_df() - y = rand([:a, :b, :c], 100) - sa = StatisticalAtLeastOnce(UnequalVarianceZTest) - @test (SoleFeatures.transform!(df, y, sa) isa DataFrame) - end +# @testset "transform!(md, bm) using bitmask on whole MultiDataset" begin +# df = random_timeseries_df(; nvar=5) +# md = MultiData.MultiDataset(df, [[4,2,1], [5,3]]) +# bm_mod = BitVector([0,1,0,1,1]) +# # expected values +# emfd = deepcopy(md) +# MultiData.dropvariables!(emfd, [1,3]) - @testset "CompoundStatisticalMajority" begin - df = random_df() - y = rand([:a, :b, :c], 100) - cm = CompoundStatisticalMajority(UnequalVarianceTTest, MannWhitneyUTest) - @test (SoleFeatures.transform!(df, y, cm) isa DataFrame) - end +# SoleFeatures.transform!(md, bm_mod) - @testset "CompoundStatisticalAtLeastOnce" begin - df = random_df() - y = rand([:a, :b, :c], 100) - ca = CompoundStatisticalAtLeastOnce(UnequalVarianceZTest, MannWhitneyUTest) - @test (SoleFeatures.transform!(df, y, ca) isa DataFrame) - end +# @test isequal(md, emfd) +# end - @testset "CorrelationFilter" begin - df = random_df() - cf = CorrelationFilter(cor, 0) - @test (SoleFeatures.transform!(df, cf) isa DataFrame) - end +# @testset "transform!(df, bm) using bitmask on DataFrame" begin +# df = random_timeseries_df(; nvar=5) +# bm = BitVector([0,1,0,1,1]) +# # expected values +# edf = deepcopy(df) +# select!(edf, [2,4,5]) - @testset "Chi2Filter" begin - df = random_df() - y = rand([:a, :b, :c], 100) - c2r = Chi2Ranking(3) - @test (SoleFeatures.transform!(df, y, c2r) isa DataFrame) - end +# SoleFeatures.transform!(df, bm) - @testset 
"PearsonCorRanking" begin - df = random_df() - y = rand(100) - c2r = PearsonCorRanking(3) - @test (SoleFeatures.transform!(df, y, c2r) isa DataFrame) - end +# @test isequal(df, edf) +# end - @testset "MutualInformationClassif" begin - df = random_df() - y = rand([:a, :b, :c], 100) - mir = MutualInformationClassifRanking(3) - @test (SoleFeatures.transform!(df, y, mir) isa DataFrame) - end - - # TODO: make this work: see the FIXME in the file test/runtests.jl - # @testset "VarianceRanking on MultiDataset" begin - # df = random_df(); - # df = SoleFeatures.minmax_normalize(df; min_quantile=0.0, max_quantile=1.0) - # md = MultiData.MultiDataset([ [1,2,3,4], [5] ], df) - # vr = VarianceRanking(3) - # @test (SoleFeatures.transform!(md, vr; i_modality=1) isa MultiDataset) - # end - - end +# @testset "transform(md, bm; i_modality) using bitmask on a modality of MultiDataset" begin +# df = random_timeseries_df(; nvar=10) +# md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) +# bm_mod = BitVector([0,1,0]) +# idx_mod = 1 +# # expected values +# emfd = deepcopy(md) +# MultiData.dropvariables!(emfd, [3,8]) - end +# md = SoleFeatures.transform(md, bm_mod; i_modality=idx_mod) -end +# @test isequal(md, emfd) +# end + +# @testset "transform(md, bm) using bitmask on whole MultiDataset" begin +# df = random_timeseries_df(; nvar=5) +# md = MultiData.MultiDataset(df, [[4,2,1], [5,3]]) +# bm_mod = BitVector([0,1,0,1,1]) +# # expected values +# emfd = deepcopy(md) +# MultiData.dropvariables!(emfd, [1,3]) + +# md = SoleFeatures.transform(md, bm_mod) + +# @test isequal(md, emfd) +# end + +# @testset "transform(df, bm) using bitmask on DataFrame" begin +# df = random_timeseries_df(; nvar=5) +# bm = BitVector([0,1,0,1,1]) +# # expected values +# edf = deepcopy(df) +# select!(edf, [2,4,5]) + +# df = SoleFeatures.transform(df, bm) + +# @test isequal(df, edf) +# end + +# end + +# @testset "utils" begin + +# @testset "_mod_bm2mfd_bm using array of frames and array of bitmasks" begin +# df = random_timeseries_df(; nvar=10) +# md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) +# frms = [1,2,3] +# bms = Vector{BitVector}([ [0,1,0],[0,0,1],[0,1,1,0] ]) +# # expected values +# ebm = BitVector([ 0,0,0,1,0,1,1,0,1,0 ]) + +# resbm = SoleFeatures._mod_bm2mfd_bm(md, frms, bms) + +# @test isequal(resbm, ebm) +# end + +# @testset "_mod_bm2mfd_bm using modality and bitmask" begin +# df = random_timeseries_df(; nvar=10) +# md = MultiData.MultiDataset(df, [[3,7,8], [1,2,4], [5,6,9,10]]) +# frm = 2 +# bm = BitVector([0,0,1]) +# # expected values +# ebm = BitVector([ 0,0,1,1,1,1,1,1,1,1 ]) + +# resbm = SoleFeatures._mod_bm2mfd_bm(md, frm, bm) + +# @test isequal(resbm, ebm) +# end + +# end + +# @testset "selectors" begin + +# @testset "transform" begin + +# @testset "RandomRanking" begin +# seed = 1997 +# rr = RandomRanking(3, seed) +# df = random_timeseries_df(;nvar=10) +# # expected values +# edf = deepcopy(df) +# select!(edf, [6,2,5]) + +# SoleFeatures.transform!(df, rr) + +# @test isequal(df, edf) +# end + +# @testset "VarianceThreshold" begin +# df = random_df() +# ndf = SoleFeatures.minmax_normalize(df; min_quantile=0.0, max_quantile=1.0) +# vt = VarianceThreshold(0.09) +# @test (SoleFeatures.transform!(df, vt) isa DataFrame) +# end + +# @testset "VarianceRanking" begin +# df = random_df() +# ndf = SoleFeatures.minmax_normalize(df; min_quantile=0.0, max_quantile=1.0) +# vr = VarianceRanking(3) +# @test (SoleFeatures.transform!(df, vr) isa DataFrame) +# end + +# @testset "StatisticalMajority" begin +# df = 
random_df() +# y = rand([:a, :b, :c], 100) +# sm = StatisticalMajority(UnequalVarianceTTest) +# @test (SoleFeatures.transform!(df, y, sm) isa DataFrame) +# end + +# @testset "StatisticalAtLeastOnce" begin +# df = random_df() +# y = rand([:a, :b, :c], 100) +# sa = StatisticalAtLeastOnce(UnequalVarianceZTest) +# @test (SoleFeatures.transform!(df, y, sa) isa DataFrame) +# end + +# @testset "CompoundStatisticalMajority" begin +# df = random_df() +# y = rand([:a, :b, :c], 100) +# cm = CompoundStatisticalMajority(UnequalVarianceTTest, MannWhitneyUTest) +# @test (SoleFeatures.transform!(df, y, cm) isa DataFrame) +# end + +# @testset "CompoundStatisticalAtLeastOnce" begin +# df = random_df() +# y = rand([:a, :b, :c], 100) +# ca = CompoundStatisticalAtLeastOnce(UnequalVarianceZTest, MannWhitneyUTest) +# @test (SoleFeatures.transform!(df, y, ca) isa DataFrame) +# end + +# @testset "CorrelationFilter" begin +# df = random_df() +# cf = CorrelationFilter(cor, 0) +# @test (SoleFeatures.transform!(df, cf) isa DataFrame) +# end + +# @testset "Chi2Filter" begin +# df = random_df() +# y = rand([:a, :b, :c], 100) +# c2r = Chi2Ranking(3) +# @test (SoleFeatures.transform!(df, y, c2r) isa DataFrame) +# end + +# @testset "PearsonCorRanking" begin +# df = random_df() +# y = rand(100) +# c2r = PearsonCorRanking(3) +# @test (SoleFeatures.transform!(df, y, c2r) isa DataFrame) +# end + +# @testset "MutualInformationClassif" begin +# df = random_df() +# y = rand([:a, :b, :c], 100) +# mir = MutualInformationClassifRanking(3) +# @test (SoleFeatures.transform!(df, y, mir) isa DataFrame) +# end + +# # TODO: make this work: see the FIXME in the file test/runtests.jl +# # @testset "VarianceRanking on MultiDataset" begin +# # df = random_df(); +# # df = SoleFeatures.minmax_normalize(df; min_quantile=0.0, max_quantile=1.0) +# # md = MultiData.MultiDataset([ [1,2,3,4], [5] ], df) +# # vr = VarianceRanking(3) +# # @test (SoleFeatures.transform!(md, vr; i_modality=1) isa MultiDataset) +# # end + +# end + +# end + +# end
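Reviewer note (not part of the diff): an end-to-end sketch of the new entry point, mirroring the added test; the toy data and column names are hypothetical:

```julia
using SoleFeatures, DataFrames, Statistics, Random

# 5 instances, 2 variables, each cell a length-10 time series
rng = Xoshiro(1)
X = DataFrame(
    temp  = [rand(rng, 10) for _ in 1:5],
    press = [rand(rng, 10) for _ in 1:5],
)

# mean and std over 3 adaptive windows per series; every cell of the
# result is a `Feature` carrying the extracted value plus its provenance
result = feature_selection_preprocess(X; features = [mean, std], nwindows = 3)
size(result)   # expected (5, 12): 2 features × 2 variables × 3 windows

f = result[1, 1]
f.value                          # extracted scalar
SoleFeatures.variable_name(f)    # "temp"
SoleFeatures.feature_type(f)     # :mean
SoleFeatures.window_number(f)    # 1
f < 1.0                          # Features compare like numbers via their value
```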