From 3c066a62d784da6e3e64de4b86ae9b4a4cf135ed Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Tue, 1 Feb 2022 17:58:19 +0100
Subject: [PATCH 1/9] add CombinatorialGapKFold

---
 README.md        |   7 +--
 tscv/__init__.py |   2 +
 tscv/_split.py   | 120 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 59d585e..2e5ce93 100644
--- a/README.md
+++ b/README.md
@@ -24,15 +24,16 @@ conda install -c conda-forge tscv
 
 ## Usage
 
-This extension defines 3 cross-validator classes and 1 function:
+This extension defines 4 cross-validator classes and 1 function:
 - `GapLeavePOut`
 - `GapKFold`
 - `GapRollForward`
+- `CombinatorialGapKFold`
 - `gap_train_test_split`
 
-The three classes can all be passed, as the `cv` argument, to
+The four classes can all be passed, as the `cv` argument, to
 scikit-learn functions such as `cross-validate`, `cross_val_score`,
-and `cross_val_predict`, just like the native cross-validator classes.
+and `cross_val_predict` (except `CombinatorialGapKFold`), just like the native cross-validator classes.
 
 The one function is an alternative to the `train_test_split` function in `scikit-learn`.
 
diff --git a/tscv/__init__.py b/tscv/__init__.py
index 4b466ba..b0c6d97 100644
--- a/tscv/__init__.py
+++ b/tscv/__init__.py
@@ -3,6 +3,7 @@
 from ._split import GapKFold
 from ._split import GapWalkForward
 from ._split import GapRollForward
+from ._split import CombinatorialGapKFold
 from ._split import gap_train_test_split
 
 
@@ -13,4 +14,5 @@
            'GapKFold',
            'GapWalkForward',
            'GapRollForward',
+           'CombinatorialGapKFold',
            'gap_train_test_split']
diff --git a/tscv/_split.py b/tscv/_split.py
index 31e6521..d5a1354 100644
--- a/tscv/_split.py
+++ b/tscv/_split.py
@@ -10,7 +10,7 @@
 import numbers
 from math import modf
 from abc import ABCMeta, abstractmethod
-from itertools import chain
+from itertools import chain, combinations
 from inspect import signature
 
 import numpy as np
@@ -371,6 +371,124 @@ def get_n_splits(self, X=None, y=None, groups=None):
         return self.n_splits
 
 
+class CombinatorialGapKFold(GapCrossValidator):
+    """Combinatorial K-Folds cross-validator with Gaps
+
+    Provides train/test indices to split data in train/test sets. Splits the
+    data, unshuffled, into N groups; every k-group combination is a test set.
+
+    Parameters
+    ----------
+    N : int, default=5
+        Number of groups. Must be at least 2.
+
+    k : int, default=2
+        Number of test splints. Must be at least 2.
+
+    gap_before : int, default=2
+        Gap before the test sets.
+
+    gap_after : int, default=0
+        Gap after the test sets.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from tscv import CombinatorialGapKFold
+    >>> cgkf = CombinatorialGapKFold(N=5, k=2, gap_before=1, gap_after=1)
+    >>> cgkf.get_n_splits(np.arange(10))
+    10
+    >>> print(cgkf)
+    CombinatorialGapKFold(N=None, gap_after=1, gap_before=1, k=None)
+    >>> for train_index, test_index in cgkf.split(np.arange(10)):
+    ...    print("TRAIN:", train_index, "TEST:", test_index)
+    TRAIN: [5 6 7 8 9] TEST: [0 1 2 3]
+    TRAIN: [7 8 9] TEST: [0 1 4 5]
+    TRAIN: [3 4 9] TEST: [0 1 6 7]
+    TRAIN: [3 4 5 6] TEST: [0 1 8 9]
+    TRAIN: [0 7 8 9] TEST: [2 3 4 5]
+    TRAIN: [0 9] TEST: [2 3 6 7]
+    TRAIN: [0 5 6] TEST: [2 3 8 9]
+    TRAIN: [0 1 2 9] TEST: [4 5 6 7]
+    TRAIN: [0 1 2] TEST: [4 5 8 9]
+    TRAIN: [0 1 2 3 4] TEST: [6 7 8 9]
+    """
+
+    def __init__(self, N=5, k=2, gap_before=0, gap_after=0):
+        if not isinstance(N, numbers.Integral):
+            raise ValueError('The number of groups must be of Integral type. '
+                             '%s of type %s was passed.'
+                             % (N, type(N)))
+        N = int(N)
+
+        if not isinstance(k, numbers.Integral):
+            raise ValueError('The number of test splits must be of Integral type. '
+                             '%s of type %s was passed.'
+                             % (k, type(k)))
+        k = int(k)
+
+        if N <= 1:
+            raise ValueError(
+                "Combinatorial k-fold cross-validation requires at least two"
+                " groups by setting N=2 or more,"
+                " got N={0}.".format(N))
+
+        if k < 1:
+            raise ValueError(
+                "Combinatorial k-fold cross-validation requires at least one"
+                " test split by setting k=1 or more,"
+                " got k={0}.".format(k))
+
+        super().__init__(gap_before, gap_after)
+        self.n_groups = N
+        self.test_splits = k
+
+    def split(self, X, y=None, groups=None):
+        n_samples = _num_samples(X)
+        n_splits = self.n_groups
+        gap_before, gap_after = self.gap_before, self.gap_after
+        if n_splits > n_samples:
+            raise ValueError(
+                ("Cannot have number of splits n_splits={0} greater"
+                 " than the number of samples: n_samples={1}.")
+                .format(self.n_groups, n_samples))
+        self.indexes = np.arange(n_samples)
+        splits = [(split[0], split[-1]+1) for split in np.array_split(self.indexes, self.n_groups)]
+        splits_combinations = list(combinations(splits, self.test_splits))
+        for splits_combination in splits_combinations:
+            test_indexes = np.empty(0)
+            train_indexes = self.indexes
+            for start, stop in splits_combination:
+                test_indexes = np.union1d(test_indexes, self.indexes[start:stop]).astype(int)
+                begin = max(0, start-gap_before)
+                end = min(n_samples, stop+gap_after)
+                train_indexes = np.intersect1d(train_indexes, np.setdiff1d(self.indexes, self.indexes[begin:end]))
+                if len(train_indexes) <= 0:
+                    raise ValueError("Not enough training samples available")
+            yield train_indexes, test_indexes
+
+    def get_n_splits(self, X=None, y=None, groups=None):
+        """Returns the number of splitting iterations in the cross-validator
+
+        Parameters
+        ----------
+        X : object
+            Always ignored, exists for compatibility.
+
+        y : object
+            Always ignored, exists for compatibility.
+
+        groups : object
+            Always ignored, exists for compatibility.
+
+        Returns
+        -------
+        n_splits : int
+            Returns the number of splitting iterations in the cross-validator.
+        """
+        return len(list(combinations(range(self.n_groups), self.test_splits)))
+
+
 def gap_train_test_split(*arrays, **options):
     """Split arrays or matrices into random train and test subsets (with a gap)
 

From cb3b0df26ad39e0aff863f26caf54a41e12137a0 Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Tue, 1 Feb 2022 18:54:58 +0100
Subject: [PATCH 2/9] pep8

---
 tscv/_split.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tscv/_split.py b/tscv/_split.py
index d5a1354..46ab3b2 100644
--- a/tscv/_split.py
+++ b/tscv/_split.py
@@ -24,6 +24,7 @@
            'GapLeavePOut',
            'GapKFold',
            'GapWalkForward',
+           'CombinatorialGapKFold',
            'gap_train_test_split']
 
 
@@ -422,8 +423,8 @@ def __init__(self, N=5, k=2, gap_before=0, gap_after=0):
         N = int(N)
 
         if not isinstance(k, numbers.Integral):
-            raise ValueError('The number of test splits must be of Integral type. '
-                             '%s of type %s was passed.'
+            raise ValueError('The number of test splits must be of Integral '
+                             'type. %s of type %s was passed.'
                              % (k, type(k)))
         k = int(k)
 
@@ -453,16 +454,20 @@ def split(self, X, y=None, groups=None):
                  " than the number of samples: n_samples={1}.")
                 .format(self.n_groups, n_samples))
         self.indexes = np.arange(n_samples)
-        splits = [(split[0], split[-1]+1) for split in np.array_split(self.indexes, self.n_groups)]
+        splits = [(split[0], split[-1]+1) 
+                  for split 
+                  in np.array_split(self.indexes, self.n_groups)]
         splits_combinations = list(combinations(splits, self.test_splits))
         for splits_combination in splits_combinations:
             test_indexes = np.empty(0)
             train_indexes = self.indexes
             for start, stop in splits_combination:
-                test_indexes = np.union1d(test_indexes, self.indexes[start:stop]).astype(int)
+                test_indexes = np.union1d(
+                    test_indexes, self.indexes[start:stop]).astype(int)
                 begin = max(0, start-gap_before)
                 end = min(n_samples, stop+gap_after)
-                train_indexes = np.intersect1d(train_indexes, np.setdiff1d(self.indexes, self.indexes[begin:end]))
+                train_indexes = np.intersect1d(train_indexes, 
+                    np.setdiff1d(self.indexes, self.indexes[begin:end]))
                 if len(train_indexes) <= 0:
                     raise ValueError("Not enough training samples available")
             yield train_indexes, test_indexes

From 60029a9c7201e3e78023333f8cfe4031f37ccd14 Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Tue, 1 Feb 2022 19:01:36 +0100
Subject: [PATCH 3/9] pep8

---
 tscv/_split.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tscv/_split.py b/tscv/_split.py
index 46ab3b2..978737c 100644
--- a/tscv/_split.py
+++ b/tscv/_split.py
@@ -454,8 +454,8 @@ def split(self, X, y=None, groups=None):
                  " than the number of samples: n_samples={1}.")
                 .format(self.n_groups, n_samples))
         self.indexes = np.arange(n_samples)
-        splits = [(split[0], split[-1]+1) 
-                  for split 
+        splits = [(split[0], split[-1] + 1)
+                  for split
                   in np.array_split(self.indexes, self.n_groups)]
         splits_combinations = list(combinations(splits, self.test_splits))
         for splits_combination in splits_combinations:
@@ -464,9 +464,9 @@ def split(self, X, y=None, groups=None):
             for start, stop in splits_combination:
                 test_indexes = np.union1d(
                     test_indexes, self.indexes[start:stop]).astype(int)
-                begin = max(0, start-gap_before)
-                end = min(n_samples, stop+gap_after)
-                train_indexes = np.intersect1d(train_indexes, 
+                begin = max(0, start - gap_before)
+                end = min(n_samples, stop + gap_after)
+                train_indexes = np.intersect1d(train_indexes,
                     np.setdiff1d(self.indexes, self.indexes[begin:end]))
                 if len(train_indexes) <= 0:
                     raise ValueError("Not enough training samples available")

From 43dd80675320bd418538887069a1a79755d742ca Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Mon, 7 Feb 2022 12:43:26 +0100
Subject: [PATCH 4/9] add test
 https://github.com/WenjieZ/TSCV/pull/41#issuecomment-1030515788

---
 tscv/tests/test_split.py | 104 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py
index 2b43ec0..4f21152 100644
--- a/tscv/tests/test_split.py
+++ b/tscv/tests/test_split.py
@@ -15,6 +15,7 @@
 from tscv import GapKFold
 from tscv import GapWalkForward
 from tscv import GapRollForward
+from tscv import CombinatorialGapKFold
 from tscv import gap_train_test_split
 
 
@@ -568,3 +569,106 @@ def test_roll_size(self):
         train, test = next(splits)
         assert_array_equal(train, [0, 1, 2, 3])
         assert_array_equal(test, [6])
+
+
+def test_combinatorial_gap_k_fold():
+    splits = CombinatorialGapKFold().split(np.arange(10))
+
+    train, test = next(splits)
+    assert_array_equal(train, [4, 5, 6, 7, 8, 9])
+    assert_array_equal(test, [0, 1, 2, 3])
+
+    train, test = next(splits)
+    assert_array_equal(train, [2, 3, 6, 7, 8, 9])
+    assert_array_equal(test, [0, 1, 4, 5])
+
+    train, test = next(splits)
+    assert_array_equal(train, [2, 3, 4, 5, 8, 9])
+    assert_array_equal(test, [0, 1, 6, 7])
+
+    train, test = next(splits)
+    assert_array_equal(train, [2, 3, 4, 5, 6, 7])
+    assert_array_equal(test, [0, 1, 8, 9])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 6, 7, 8, 9])
+    assert_array_equal(test, [2, 3, 4, 5])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 4, 5, 8, 9])
+    assert_array_equal(test, [2, 3, 6, 7])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 4, 5, 6, 7])
+    assert_array_equal(test, [2, 3, 8, 9])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 2, 3, 8, 9])
+    assert_array_equal(test, [4, 5, 6, 7])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 2, 3, 6, 7])
+    assert_array_equal(test, [4, 5, 8, 9])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 2, 3, 4, 5])
+    assert_array_equal(test, [6, 7, 8, 9])
+
+    splits = CombinatorialGapKFold(
+        N=5, k=2, gap_before=1, gap_after=2).split(np.arange(10))
+
+    train, test = next(splits)
+    assert_array_equal(train, [6, 7, 8, 9])
+    assert_array_equal(test, [0, 1, 2, 3])
+
+    train, test = next(splits)
+    assert_array_equal(train, [8, 9])
+    assert_array_equal(test, [0, 1, 4, 5])
+
+    train, test = next(splits)
+    assert_array_equal(train, [4])
+    assert_array_equal(test, [0, 1, 6, 7])
+
+    train, test = next(splits)
+    assert_array_equal(train, [4, 5, 6])
+    assert_array_equal(test, [0, 1, 8, 9])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 8, 9])
+    assert_array_equal(test, [2, 3, 4, 5])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0])
+    assert_array_equal(test, [2, 3, 6, 7])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 6])
+    assert_array_equal(test, [2, 3, 8, 9])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 2])
+    assert_array_equal(test, [4, 5, 6, 7])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 2])
+    assert_array_equal(test, [4, 5, 8, 9])
+
+    train, test = next(splits)
+    assert_array_equal(train, [0, 1, 2, 3, 4])
+    assert_array_equal(test, [6, 7, 8, 9])
+
+    assert_equal(CombinatorialGapKFold(
+        N=10, k=3).get_n_splits(np.arange(100)), 120)
+
+    splits = CombinatorialGapKFold(
+        N=5, k=3, gap_before=1, gap_after=2).split(np.arange(10))
+
+    train, test = next(splits)
+    assert_array_equal(train, [8, 9])
+    assert_array_equal(test, [0, 1, 2, 3, 4, 5])
+
+    with pytest.raises(
+        ValueError,
+        match="Not enough training samples available"
+    ):
+        next(splits)

From d77fe02b7d98bc6feccf35b8b00092971a9c3857 Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Mon, 7 Feb 2022 12:59:45 +0100
Subject: [PATCH 5/9] more tests

---
 tscv/tests/test_split.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py
index 4f21152..b240233 100644
--- a/tscv/tests/test_split.py
+++ b/tscv/tests/test_split.py
@@ -672,3 +672,35 @@ def test_combinatorial_gap_k_fold():
         match="Not enough training samples available"
     ):
         next(splits)
+
+    with pytest.raises(
+        ValueError,
+        match="The number of groups must be of Integral type. 5.0 of type <class 'float'> was passed."
+    ):
+        CombinatorialGapKFold(N=5.0, k=2)
+
+    with pytest.raises(
+        ValueError,
+        match="The number of test splits must be of Integral type. 2 of type <class 'str'> was passed."
+    ):
+        CombinatorialGapKFold(N=5, k="2")
+
+    with pytest.raises(
+        ValueError,
+        match="Combinatorial k-fold cross-validation requires at least two groups by setting N=2 or more, got N=1."
+    ):
+        CombinatorialGapKFold(N=1, k=2)
+
+    with pytest.raises(
+        ValueError,
+        match="Combinatorial k-fold cross-validation requires at least one test split by setting k=1 or more, got k=0."
+    ):
+        CombinatorialGapKFold(N=5, k=0)
+
+    splits = CombinatorialGapKFold(N=15, k=2).split(np.arange(10))
+
+    with pytest.raises(
+        ValueError,
+        match="Cannot have number of splits n_splits=15 greater than the number of samples: n_samples=10."
+    ):
+        next(splits)

From e587202fc26e67b1bfaf1241deddfa0b4c3e50e8 Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Tue, 8 Feb 2022 18:57:42 +0100
Subject: [PATCH 6/9] fix docstring

---
 tscv/_split.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tscv/_split.py b/tscv/_split.py
index 978737c..03268b3 100644
--- a/tscv/_split.py
+++ b/tscv/_split.py
@@ -384,9 +384,9 @@ class CombinatorialGapKFold(GapCrossValidator):
         Number of groups. Must be at least 2.
 
     k : int, default=2
-        Number of test splints. Must be at least 2.
+        Number of test splits. Must be at least 1.
 
-    gap_before : int, default=2
+    gap_before : int, default=0
         Gap before the test sets.
 
     gap_after : int, default=0

From 0f8c24dc67e79ef4984dfe46d595074b39030802 Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Fri, 11 Feb 2022 10:33:41 +0100
Subject: [PATCH 7/9] fix
 https://github.com/WenjieZ/TSCV/pull/41#discussion_r804434648

---
 tscv/_split.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tscv/_split.py b/tscv/_split.py
index 03268b3..73002bb 100644
--- a/tscv/_split.py
+++ b/tscv/_split.py
@@ -12,6 +12,7 @@
 from abc import ABCMeta, abstractmethod
 from itertools import chain, combinations
 from inspect import signature
+from scipy.special import comb
 
 import numpy as np
 from sklearn.utils import indexable
@@ -491,7 +492,7 @@ def get_n_splits(self, X=None, y=None, groups=None):
         n_splits : int
             Returns the number of splitting iterations in the cross-validator.
         """
-        return len(list(combinations(range(self.n_groups), self.test_splits)))
+        return int(comb(self.n_groups, self.test_splits, exact=True))
 
 
 def gap_train_test_split(*arrays, **options):

From c226a52bae6b19dc29c9b81abe752d593345932b Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Fri, 11 Feb 2022 13:16:18 +0100
Subject: [PATCH 8/9] fix
 https://github.com/WenjieZ/TSCV/pull/41#pullrequestreview-879875822 and
 https://github.com/WenjieZ/TSCV/pull/41#discussion_r804533522

---
 setup.py       |  2 +-
 tscv/_split.py | 23 +++++++----------------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/setup.py b/setup.py
index c5c892d..08d062a 100644
--- a/setup.py
+++ b/setup.py
@@ -52,5 +52,5 @@ def get_version(rel_path):
                  'Programming Language :: Python :: 3.9',
                  ],
     python_requires=">=3.6",
-    install_requires=['numpy>=1.13.3', 'scikit-learn>=0.22']
+    install_requires=['numpy>=1.13.3', 'scipy>=1.3.0', 'scikit-learn>=0.22']
 )
diff --git a/tscv/_split.py b/tscv/_split.py
index 73002bb..7b83dbf 100644
--- a/tscv/_split.py
+++ b/tscv/_split.py
@@ -445,9 +445,8 @@ def __init__(self, N=5, k=2, gap_before=0, gap_after=0):
         self.n_groups = N
         self.test_splits = k
 
-    def split(self, X, y=None, groups=None):
+    def _iter_test_indices(self, X, y=None, groups=None):
         n_samples = _num_samples(X)
         n_splits = self.n_groups
-        gap_before, gap_after = self.gap_before, self.gap_after
         if n_splits > n_samples:
             raise ValueError(
@@ -458,20 +457,12 @@ def split(self, X, y=None, groups=None):
         splits = [(split[0], split[-1] + 1)
                   for split
                   in np.array_split(self.indexes, self.n_groups)]
-        splits_combinations = list(combinations(splits, self.test_splits))
-        for splits_combination in splits_combinations:
-            test_indexes = np.empty(0)
-            train_indexes = self.indexes
-            for start, stop in splits_combination:
-                test_indexes = np.union1d(
-                    test_indexes, self.indexes[start:stop]).astype(int)
-                begin = max(0, start - gap_before)
-                end = min(n_samples, stop + gap_after)
-                train_indexes = np.intersect1d(train_indexes,
-                    np.setdiff1d(self.indexes, self.indexes[begin:end]))
-                if len(train_indexes) <= 0:
-                    raise ValueError("Not enough training samples available")
-            yield train_indexes, test_indexes
+        # Stream the combinations instead of materialising all C(N, k) of
+        # them. Each group is a contiguous, ascending index range, so
+        # concatenating the groups in order equals their sorted union.
+        for splits_combination in combinations(splits, self.test_splits):
+            yield np.concatenate([self.indexes[start:stop]
+                                  for start, stop in splits_combination])
 
     def get_n_splits(self, X=None, y=None, groups=None):
         """Returns the number of splitting iterations in the cross-validator
 
     def get_n_splits(self, X=None, y=None, groups=None):
         """Returns the number of splitting iterations in the cross-validator

From c7b2bed7958d94125a362e4b07c03fb29b8615aa Mon Sep 17 00:00:00 2001
From: Aldo <aldo.deramo@gmail.com>
Date: Fri, 11 Feb 2022 13:19:45 +0100
Subject: [PATCH 9/9] fix tests

---
 tscv/tests/test_split.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tscv/tests/test_split.py b/tscv/tests/test_split.py
index b240233..8540bd1 100644
--- a/tscv/tests/test_split.py
+++ b/tscv/tests/test_split.py
@@ -667,12 +667,6 @@ def test_combinatorial_gap_k_fold():
     assert_array_equal(train, [8, 9])
     assert_array_equal(test, [0, 1, 2, 3, 4, 5])
 
-    with pytest.raises(
-        ValueError,
-        match="Not enough training samples available"
-    ):
-        next(splits)
-
     with pytest.raises(
         ValueError,
         match="The number of groups must be of Integral type. 5.0 of type <class 'float'> was passed."