diff --git a/README.md b/README.md index 59d585e..2e5ce93 100644 --- a/README.md +++ b/README.md @@ -24,15 +24,16 @@ conda install -c conda-forge tscv ## Usage -This extension defines 3 cross-validator classes and 1 function: +This extension defines 4 cross-validator classes and 1 function: - `GapLeavePOut` - `GapKFold` - `GapRollForward` +- `CombinatorialGapKFold` - `gap_train_test_split` -The three classes can all be passed, as the `cv` argument, to +The four classes can all be passed, as the `cv` argument, to scikit-learn functions such as `cross-validate`, `cross_val_score`, -and `cross_val_predict`, just like the native cross-validator classes. +and `cross_val_predict` (except `CombinatorialGapKFold`), just like the native cross-validator classes. The one function is an alternative to the `train_test_split` function in `scikit-learn`. diff --git a/setup.py b/setup.py index c5c892d..08d062a 100644 --- a/setup.py +++ b/setup.py @@ -52,5 +52,5 @@ def get_version(rel_path): 'Programming Language :: Python :: 3.9', ], python_requires=">=3.6", - install_requires=['numpy>=1.13.3', 'scikit-learn>=0.22'] + install_requires=['numpy>=1.13.3', 'scipy>=1.3.0', 'scikit-learn>=0.22'] ) diff --git a/tscv/__init__.py b/tscv/__init__.py index 4b466ba..b0c6d97 100644 --- a/tscv/__init__.py +++ b/tscv/__init__.py @@ -3,6 +3,7 @@ from ._split import GapKFold from ._split import GapWalkForward from ._split import GapRollForward +from ._split import CombinatorialGapKFold from ._split import gap_train_test_split @@ -13,4 +14,5 @@ 'GapKFold', 'GapWalkForward', 'GapRollForward', + 'CombinatorialGapKFold', 'gap_train_test_split'] diff --git a/tscv/_split.py b/tscv/_split.py index 31e6521..7b83dbf 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -10,8 +10,9 @@ import numbers from math import modf from abc import ABCMeta, abstractmethod -from itertools import chain +from itertools import chain, combinations from inspect import signature +from scipy.special import comb import numpy as np from sklearn.utils import indexable @@ -24,6 +25,7 @@ 'GapLeavePOut', 'GapKFold', 'GapWalkForward', + 'CombinatorialGapKFold', 'gap_train_test_split'] @@ -371,6 +373,121 @@ def get_n_splits(self, X=None, y=None, groups=None): return self.n_splits +class CombinatorialGapKFold(GapCrossValidator): + """Combinatorial K-Folds cross-validator with Gaps + + Provides train/test indices to split data in train/test sets. Split + dataset into N groups of k folds (without shuffling). + + Parameters + ---------- + N : int, default=5 + Number of groups. Must be at least 2. + + k : int, default=2 + Number of test splits. Must be at least 1. + + gap_before : int, default=0 + Gap before the test sets. + + gap_after : int, default=0 + Gap after the test sets. + + Examples + -------- + >>> import numpy as np + >>> from tscv import CombinatorialGapKFold + >>> cgkf = CombinatorialGapKFold(N=5, k=2, gap_before=1, gap_after=1) + >>> cgkf.get_n_splits(np.arange(10)) + 10 + >>> print(cgkf) + CombinatorialGapKFold(N=None, gap_after=1, gap_before=1, k=None) + >>> for train_index, test_index in cgkf.split(np.arange(10)): + ... print("TRAIN:", train_index, "TEST:", test_index) + TRAIN: [5 6 7 8 9] TEST: [0 1 2 3] + TRAIN: [7 8 9] TEST: [0 1 4 5] + TRAIN: [3 4 9] TEST: [0 1 6 7] + TRAIN: [3 4 5 6] TEST: [0 1 8 9] + TRAIN: [0 7 8 9] TEST: [2 3 4 5] + TRAIN: [0 9] TEST: [2 3 6 7] + TRAIN: [0 5 6] TEST: [2 3 8 9] + TRAIN: [0 1 2 9] TEST: [4 5 6 7] + TRAIN: [0 1 2] TEST: [4 5 8 9] + TRAIN: [0 1 2 3 4] TEST: [6 7 8 9] + """ + + def __init__(self, N=5, k=2, gap_before=0, gap_after=0): + if not isinstance(N, numbers.Integral): + raise ValueError('The number of groups must be of Integral type. ' + '%s of type %s was passed.' + % (N, type(N))) + N = int(N) + + if not isinstance(k, numbers.Integral): + raise ValueError('The number of test splits must be of Integral ' + 'type. %s of type %s was passed.' + % (k, type(k))) + k = int(k) + + if N <= 1: + raise ValueError( + "Combinatorial k-fold cross-validation requires at least two" + " groups by setting N=2 or more," + " got N={0}.".format(N)) + + if k < 1: + raise ValueError( + "Combinatorial k-fold cross-validation requires at least one" + " test split by setting k=1 or more," + " got k={0}.".format(k)) + + super().__init__(gap_before, gap_after) + self.n_groups = N + self.test_splits = k + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + n_splits = self.n_groups + gap_before, gap_after = self.gap_before, self.gap_after + if n_splits > n_samples: + raise ValueError( + ("Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}.") + .format(self.n_groups, n_samples)) + self.indexes = np.arange(n_samples) + splits = [(split[0], split[-1] + 1) + for split + in np.array_split(self.indexes, self.n_groups)] + splits_combinations = list(combinations(splits, self.test_splits)) + for splits_combination in splits_combinations: + test_indexes = np.empty(0) + for start, stop in splits_combination: + test_indexes = np.union1d( + test_indexes, self.indexes[start:stop]).astype(int) + yield test_indexes + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return int(comb(self.n_groups, self.test_splits)) + + def gap_train_test_split(*arrays, **options): """Split arrays or matrices into random train and test subsets (with a gap) diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py index 2b43ec0..8540bd1 100644 --- a/tscv/tests/test_split.py +++ b/tscv/tests/test_split.py @@ -15,6 +15,7 @@ from tscv import GapKFold from tscv import GapWalkForward from tscv import GapRollForward +from tscv import CombinatorialGapKFold from tscv import gap_train_test_split @@ -568,3 +569,132 @@ def test_roll_size(self): train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3]) assert_array_equal(test, [6]) + + +def test_combinatorial_gap_k_fold(): + splits = CombinatorialGapKFold().split(np.arange(10)) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7, 8, 9]) + assert_array_equal(test, [0, 1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 6, 7, 8, 9]) + assert_array_equal(test, [0, 1, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5, 8, 9]) + assert_array_equal(test, [0, 1, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5, 6, 7]) + assert_array_equal(test, [0, 1, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 6, 7, 8, 9]) + assert_array_equal(test, [2, 3, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4, 5, 8, 9]) + assert_array_equal(test, [2, 3, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4, 5, 6, 7]) + assert_array_equal(test, [2, 3, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 8, 9]) + assert_array_equal(test, [4, 5, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 6, 7]) + assert_array_equal(test, [4, 5, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [6, 7, 8, 9]) + + splits = CombinatorialGapKFold( + N=5, k=2, gap_before=1, gap_after=2).split(np.arange(10)) + + train, test = next(splits) + assert_array_equal(train, [6, 7, 8, 9]) + assert_array_equal(test, [0, 1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [8, 9]) + assert_array_equal(test, [0, 1, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [4]) + assert_array_equal(test, [0, 1, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6]) + assert_array_equal(test, [0, 1, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 8, 9]) + assert_array_equal(test, [2, 3, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [2, 3, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 6]) + assert_array_equal(test, [2, 3, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [4, 5, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [4, 5, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [6, 7, 8, 9]) + + assert_equal(CombinatorialGapKFold( + N=10, k=3).get_n_splits(np.arange(100)), 120) + + splits = CombinatorialGapKFold( + N=5, k=3, gap_before=1, gap_after=2).split(np.arange(10)) + + train, test = next(splits) + assert_array_equal(train, [8, 9]) + assert_array_equal(test, [0, 1, 2, 3, 4, 5]) + + with pytest.raises( + ValueError, + match="The number of groups must be of Integral type. 5.0 of type was passed." + ): + CombinatorialGapKFold(N=5.0, k=2) + + with pytest.raises( + ValueError, + match="The number of test splits must be of Integral type. 2 of type was passed." + ): + CombinatorialGapKFold(N=5, k="2") + + with pytest.raises( + ValueError, + match="Combinatorial k-fold cross-validation requires at least two groups by setting N=2 or more, got N=1." + ): + CombinatorialGapKFold(N=1, k=2) + + with pytest.raises( + ValueError, + match="Combinatorial k-fold cross-validation requires at least one test split by setting k=1 or more, got k=0." + ): + CombinatorialGapKFold(N=5, k=0) + + splits = CombinatorialGapKFold(N=15, k=2).split(np.arange(10)) + + with pytest.raises( + ValueError, + match="Cannot have number of splits n_splits=15 greater than the number of samples: n_samples=10." + ): + next(splits)