From 3c066a62d784da6e3e64de4b86ae9b4a4cf135ed Mon Sep 17 00:00:00 2001 From: Aldo Date: Tue, 1 Feb 2022 17:58:19 +0100 Subject: [PATCH 1/9] add CombinatorialGapKFold --- README.md | 7 +-- tscv/__init__.py | 2 + tscv/_split.py | 120 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 125 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 59d585e..2e5ce93 100644 --- a/README.md +++ b/README.md @@ -24,15 +24,16 @@ conda install -c conda-forge tscv ## Usage -This extension defines 3 cross-validator classes and 1 function: +This extension defines 4 cross-validator classes and 1 function: - `GapLeavePOut` - `GapKFold` - `GapRollForward` +- `CombinatorialGapKFold` - `gap_train_test_split` -The three classes can all be passed, as the `cv` argument, to +The four classes can all be passed, as the `cv` argument, to scikit-learn functions such as `cross-validate`, `cross_val_score`, -and `cross_val_predict`, just like the native cross-validator classes. +and `cross_val_predict` (except `CombinatorialGapKFold`), just like the native cross-validator classes. The one function is an alternative to the `train_test_split` function in `scikit-learn`. 
diff --git a/tscv/__init__.py b/tscv/__init__.py index 4b466ba..b0c6d97 100644 --- a/tscv/__init__.py +++ b/tscv/__init__.py @@ -3,6 +3,7 @@ from ._split import GapKFold from ._split import GapWalkForward from ._split import GapRollForward +from ._split import CombinatorialGapKFold from ._split import gap_train_test_split @@ -13,4 +14,5 @@ 'GapKFold', 'GapWalkForward', 'GapRollForward', + 'CombinatorialGapKFold', 'gap_train_test_split'] diff --git a/tscv/_split.py b/tscv/_split.py index 31e6521..d5a1354 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -10,7 +10,7 @@ import numbers from math import modf from abc import ABCMeta, abstractmethod -from itertools import chain +from itertools import chain, combinations from inspect import signature import numpy as np @@ -371,6 +371,124 @@ def get_n_splits(self, X=None, y=None, groups=None): return self.n_splits +class CombinatorialGapKFold(GapCrossValidator): + """Combinatorial K-Folds cross-validator with Gaps + + Provides train/test indices to split data in train/test sets. Split + dataset into N groups of k folds (without shuffling). + + Parameters + ---------- + N : int, default=5 + Number of groups. Must be at least 2. + + k : int, default=2 + Number of test splints. Must be at least 2. + + gap_before : int, default=2 + Gap before the test sets. + + gap_after : int, default=0 + Gap after the test sets. + + Examples + -------- + >>> import numpy as np + >>> from tscv import CombinatorialGapKFold + >>> cgkf = CombinatorialGapKFold(N=5, k=2, gap_before=1, gap_after=1) + >>> cgkf.get_n_splits(np.arange(10)) + 10 + >>> print(cgkf) + CombinatorialGapKFold(N=None, gap_after=1, gap_before=1, k=None) + >>> for train_index, test_index in cgkf.split(np.arange(10)): + ... 
print("TRAIN:", train_index, "TEST:", test_index) + TRAIN: [5 6 7 8 9] TEST: [0 1 2 3] + TRAIN: [7 8 9] TEST: [0 1 4 5] + TRAIN: [3 4 9] TEST: [0 1 6 7] + TRAIN: [3 4 5 6] TEST: [0 1 8 9] + TRAIN: [0 7 8 9] TEST: [2 3 4 5] + TRAIN: [0 9] TEST: [2 3 6 7] + TRAIN: [0 5 6] TEST: [2 3 8 9] + TRAIN: [0 1 2 9] TEST: [4 5 6 7] + TRAIN: [0 1 2] TEST: [4 5 8 9] + TRAIN: [0 1 2 3 4] TEST: [6 7 8 9] + """ + + def __init__(self, N=5, k=2, gap_before=0, gap_after=0): + if not isinstance(N, numbers.Integral): + raise ValueError('The number of groups must be of Integral type. ' + '%s of type %s was passed.' + % (N, type(N))) + N = int(N) + + if not isinstance(k, numbers.Integral): + raise ValueError('The number of test splits must be of Integral type. ' + '%s of type %s was passed.' + % (k, type(k))) + k = int(k) + + if N <= 1: + raise ValueError( + "Combinatorial k-fold cross-validation requires at least two" + " groups by setting N=2 or more," + " got N={0}.".format(N)) + + if k < 1: + raise ValueError( + "Combinatorial k-fold cross-validation requires at least one" + " test split by setting k=1 or more," + " got k={0}.".format(k)) + + super().__init__(gap_before, gap_after) + self.n_groups = N + self.test_splits = k + + def split(self, X, y=None, groups=None): + n_samples = _num_samples(X) + n_splits = self.n_groups + gap_before, gap_after = self.gap_before, self.gap_after + if n_splits > n_samples: + raise ValueError( + ("Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}.") + .format(self.n_groups, n_samples)) + self.indexes = np.arange(n_samples) + splits = [(split[0], split[-1]+1) for split in np.array_split(self.indexes, self.n_groups)] + splits_combinations = list(combinations(splits, self.test_splits)) + for splits_combination in splits_combinations: + test_indexes = np.empty(0) + train_indexes = self.indexes + for start, stop in splits_combination: + test_indexes = np.union1d(test_indexes, 
self.indexes[start:stop]).astype(int) + begin = max(0, start-gap_before) + end = min(n_samples, stop+gap_after) + train_indexes = np.intersect1d(train_indexes, np.setdiff1d(self.indexes, self.indexes[begin:end])) + if len(train_indexes) <= 0: + raise ValueError("Not enough training samples available") + yield train_indexes, test_indexes + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(list(combinations(range(self.n_groups), self.test_splits))) + + def gap_train_test_split(*arrays, **options): """Split arrays or matrices into random train and test subsets (with a gap) From cb3b0df26ad39e0aff863f26caf54a41e12137a0 Mon Sep 17 00:00:00 2001 From: Aldo Date: Tue, 1 Feb 2022 18:54:58 +0100 Subject: [PATCH 2/9] pep8 --- tscv/_split.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tscv/_split.py b/tscv/_split.py index d5a1354..46ab3b2 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -24,6 +24,7 @@ 'GapLeavePOut', 'GapKFold', 'GapWalkForward', + 'CombinatorialGapKFold', 'gap_train_test_split'] @@ -422,8 +423,8 @@ def __init__(self, N=5, k=2, gap_before=0, gap_after=0): N = int(N) if not isinstance(k, numbers.Integral): - raise ValueError('The number of test splits must be of Integral type. ' - '%s of type %s was passed.' + raise ValueError('The number of test splits must be of Integral ' + 'type. %s of type %s was passed.' 
% (k, type(k))) k = int(k) @@ -453,16 +454,20 @@ def split(self, X, y=None, groups=None): " than the number of samples: n_samples={1}.") .format(self.n_groups, n_samples)) self.indexes = np.arange(n_samples) - splits = [(split[0], split[-1]+1) for split in np.array_split(self.indexes, self.n_groups)] + splits = [(split[0], split[-1]+1) + for split + in np.array_split(self.indexes, self.n_groups)] splits_combinations = list(combinations(splits, self.test_splits)) for splits_combination in splits_combinations: test_indexes = np.empty(0) train_indexes = self.indexes for start, stop in splits_combination: - test_indexes = np.union1d(test_indexes, self.indexes[start:stop]).astype(int) + test_indexes = np.union1d( + test_indexes, self.indexes[start:stop]).astype(int) begin = max(0, start-gap_before) end = min(n_samples, stop+gap_after) - train_indexes = np.intersect1d(train_indexes, np.setdiff1d(self.indexes, self.indexes[begin:end])) + train_indexes = np.intersect1d(train_indexes, + np.setdiff1d(self.indexes, self.indexes[begin:end])) if len(train_indexes) <= 0: raise ValueError("Not enough training samples available") yield train_indexes, test_indexes From 60029a9c7201e3e78023333f8cfe4031f37ccd14 Mon Sep 17 00:00:00 2001 From: Aldo Date: Tue, 1 Feb 2022 19:01:36 +0100 Subject: [PATCH 3/9] pep8 --- tscv/_split.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tscv/_split.py b/tscv/_split.py index 46ab3b2..978737c 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -454,8 +454,8 @@ def split(self, X, y=None, groups=None): " than the number of samples: n_samples={1}.") .format(self.n_groups, n_samples)) self.indexes = np.arange(n_samples) - splits = [(split[0], split[-1]+1) - for split + splits = [(split[0], split[-1] + 1) + for split in np.array_split(self.indexes, self.n_groups)] splits_combinations = list(combinations(splits, self.test_splits)) for splits_combination in splits_combinations: @@ -464,9 +464,9 @@ def split(self, X, y=None, 
groups=None): for start, stop in splits_combination: test_indexes = np.union1d( test_indexes, self.indexes[start:stop]).astype(int) - begin = max(0, start-gap_before) - end = min(n_samples, stop+gap_after) - train_indexes = np.intersect1d(train_indexes, + begin = max(0, start - gap_before) + end = min(n_samples, stop + gap_after) + train_indexes = np.intersect1d(train_indexes, np.setdiff1d(self.indexes, self.indexes[begin:end])) if len(train_indexes) <= 0: raise ValueError("Not enough training samples available") From 43dd80675320bd418538887069a1a79755d742ca Mon Sep 17 00:00:00 2001 From: Aldo Date: Mon, 7 Feb 2022 12:43:26 +0100 Subject: [PATCH 4/9] add test https://github.com/WenjieZ/TSCV/pull/41#issuecomment-1030515788 --- tscv/tests/test_split.py | 104 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py index 2b43ec0..4f21152 100644 --- a/tscv/tests/test_split.py +++ b/tscv/tests/test_split.py @@ -15,6 +15,7 @@ from tscv import GapKFold from tscv import GapWalkForward from tscv import GapRollForward +from tscv import CombinatorialGapKFold from tscv import gap_train_test_split @@ -568,3 +569,106 @@ def test_roll_size(self): train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3]) assert_array_equal(test, [6]) + + +def test_combinatorial_gap_k_fold(): + splits = CombinatorialGapKFold().split(np.arange(10)) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7, 8, 9]) + assert_array_equal(test, [0, 1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 6, 7, 8, 9]) + assert_array_equal(test, [0, 1, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5, 8, 9]) + assert_array_equal(test, [0, 1, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5, 6, 7]) + assert_array_equal(test, [0, 1, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 6, 7, 8, 9]) + 
assert_array_equal(test, [2, 3, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4, 5, 8, 9]) + assert_array_equal(test, [2, 3, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4, 5, 6, 7]) + assert_array_equal(test, [2, 3, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 8, 9]) + assert_array_equal(test, [4, 5, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 6, 7]) + assert_array_equal(test, [4, 5, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5]) + assert_array_equal(test, [6, 7, 8, 9]) + + splits = CombinatorialGapKFold( + N=5, k=2, gap_before=1, gap_after=2).split(np.arange(10)) + + train, test = next(splits) + assert_array_equal(train, [6, 7, 8, 9]) + assert_array_equal(test, [0, 1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [8, 9]) + assert_array_equal(test, [0, 1, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [4]) + assert_array_equal(test, [0, 1, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6]) + assert_array_equal(test, [0, 1, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 8, 9]) + assert_array_equal(test, [2, 3, 4, 5]) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [2, 3, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 6]) + assert_array_equal(test, [2, 3, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [4, 5, 6, 7]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [4, 5, 8, 9]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [6, 7, 8, 9]) + + assert_equal(CombinatorialGapKFold( + N=10, k=3).get_n_splits(np.arange(100)), 120) + + splits = CombinatorialGapKFold( + N=5, k=3, gap_before=1, 
gap_after=2).split(np.arange(10)) + + train, test = next(splits) + assert_array_equal(train, [8, 9]) + assert_array_equal(test, [0, 1, 2, 3, 4, 5]) + + with pytest.raises( + ValueError, + match="Not enough training samples available" + ): + next(splits) From d77fe02b7d98bc6feccf35b8b00092971a9c3857 Mon Sep 17 00:00:00 2001 From: Aldo Date: Mon, 7 Feb 2022 12:59:45 +0100 Subject: [PATCH 5/9] more tests --- tscv/tests/test_split.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py index 4f21152..b240233 100644 --- a/tscv/tests/test_split.py +++ b/tscv/tests/test_split.py @@ -672,3 +672,35 @@ def test_combinatorial_gap_k_fold(): match="Not enough training samples available" ): next(splits) + + with pytest.raises( + ValueError, + match="The number of groups must be of Integral type. 5.0 of type <class 'float'> was passed." + ): + CombinatorialGapKFold(N=5.0, k=2) + + with pytest.raises( + ValueError, + match="The number of test splits must be of Integral type. 2 of type <class 'str'> was passed." + ): + CombinatorialGapKFold(N=5, k="2") + + with pytest.raises( + ValueError, + match="Combinatorial k-fold cross-validation requires at least two groups by setting N=2 or more, got N=1." + ): + CombinatorialGapKFold(N=1, k=2) + + with pytest.raises( + ValueError, + match="Combinatorial k-fold cross-validation requires at least one test split by setting k=1 or more, got k=0." + ): + CombinatorialGapKFold(N=5, k=0) + + splits = CombinatorialGapKFold(N=15, k=2).split(np.arange(10)) + + with pytest.raises( + ValueError, + match="Cannot have number of splits n_splits=15 greater than the number of samples: n_samples=10." 
+ ): + next(splits) From e587202fc26e67b1bfaf1241deddfa0b4c3e50e8 Mon Sep 17 00:00:00 2001 From: Aldo Date: Tue, 8 Feb 2022 18:57:42 +0100 Subject: [PATCH 6/9] fix docstring --- tscv/_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tscv/_split.py b/tscv/_split.py index 978737c..03268b3 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -384,9 +384,9 @@ class CombinatorialGapKFold(GapCrossValidator): Number of groups. Must be at least 2. k : int, default=2 - Number of test splints. Must be at least 2. + Number of test splits. Must be at least 1. - gap_before : int, default=2 + gap_before : int, default=0 Gap before the test sets. gap_after : int, default=0 From 0f8c24dc67e79ef4984dfe46d595074b39030802 Mon Sep 17 00:00:00 2001 From: Aldo Date: Fri, 11 Feb 2022 10:33:41 +0100 Subject: [PATCH 7/9] fix https://github.com/WenjieZ/TSCV/pull/41#discussion_r804434648 --- tscv/_split.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tscv/_split.py b/tscv/_split.py index 03268b3..73002bb 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -12,6 +12,7 @@ from abc import ABCMeta, abstractmethod from itertools import chain, combinations from inspect import signature +from scipy.special import comb import numpy as np from sklearn.utils import indexable @@ -491,7 +492,7 @@ def get_n_splits(self, X=None, y=None, groups=None): n_splits : int Returns the number of splitting iterations in the cross-validator. 
""" - return len(list(combinations(range(self.n_groups), self.test_splits))) + return int(comb(self.n_groups, self.test_splits)) def gap_train_test_split(*arrays, **options): From c226a52bae6b19dc29c9b81abe752d593345932b Mon Sep 17 00:00:00 2001 From: Aldo Date: Fri, 11 Feb 2022 13:16:18 +0100 Subject: [PATCH 8/9] fix https://github.com/WenjieZ/TSCV/pull/41#pullrequestreview-879875822 and https://github.com/WenjieZ/TSCV/pull/41#discussion_r804533522 --- setup.py | 2 +- tscv/_split.py | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index c5c892d..08d062a 100644 --- a/setup.py +++ b/setup.py @@ -52,5 +52,5 @@ def get_version(rel_path): 'Programming Language :: Python :: 3.9', ], python_requires=">=3.6", - install_requires=['numpy>=1.13.3', 'scikit-learn>=0.22'] + install_requires=['numpy>=1.13.3', 'scipy>=1.3.0', 'scikit-learn>=0.22'] ) diff --git a/tscv/_split.py b/tscv/_split.py index 73002bb..7b83dbf 100644 --- a/tscv/_split.py +++ b/tscv/_split.py @@ -445,7 +445,7 @@ def __init__(self, N=5, k=2, gap_before=0, gap_after=0): self.n_groups = N self.test_splits = k - def split(self, X, y=None, groups=None): + def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) n_splits = self.n_groups gap_before, gap_after = self.gap_before, self.gap_after @@ -461,17 +461,10 @@ def split(self, X, y=None, groups=None): splits_combinations = list(combinations(splits, self.test_splits)) for splits_combination in splits_combinations: test_indexes = np.empty(0) - train_indexes = self.indexes for start, stop in splits_combination: test_indexes = np.union1d( test_indexes, self.indexes[start:stop]).astype(int) - begin = max(0, start - gap_before) - end = min(n_samples, stop + gap_after) - train_indexes = np.intersect1d(train_indexes, - np.setdiff1d(self.indexes, self.indexes[begin:end])) - if len(train_indexes) <= 0: - raise ValueError("Not enough training samples available") - yield train_indexes, test_indexes 
+ yield test_indexes def get_n_splits(self, X=None, y=None, groups=None): """Returns the number of splitting iterations in the cross-validator From c7b2bed7958d94125a362e4b07c03fb29b8615aa Mon Sep 17 00:00:00 2001 From: Aldo Date: Fri, 11 Feb 2022 13:19:45 +0100 Subject: [PATCH 9/9] fix tests --- tscv/tests/test_split.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py index b240233..8540bd1 100644 --- a/tscv/tests/test_split.py +++ b/tscv/tests/test_split.py @@ -667,12 +667,6 @@ def test_combinatorial_gap_k_fold(): assert_array_equal(train, [8, 9]) assert_array_equal(test, [0, 1, 2, 3, 4, 5]) - with pytest.raises( - ValueError, - match="Not enough training samples available" - ): - next(splits) - with pytest.raises( ValueError, match="The number of groups must be of Integral type. 5.0 of type <class 'float'> was passed."