Implement covariance matrix (#839)

* init * Implement CovMatrix * typos * wrong import * fix confusion * Update cov.py * Update confusion.py * fix confusion * Update cov.py Former-commit-id: ec66af8
online-ml · Feb 4, 2022 · 1ab1a7d · 1ab1a7d
1 parent b5f6381
commit 1ab1a7d
Show file tree

Hide file tree

Showing 10 changed files with 218 additions and 90 deletions.
diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
@@ -34,6 +34,7 @@
 ## misc
 
 - Created this module to store some stuff that was in the `utils` module but wasn't necessarily shared between modules.
+- Implement `misc.CovMatrix`.
 
 ## reco
 

diff --git a/river/metrics/confusion.py b/river/metrics/confusion.py
@@ -1,7 +1,8 @@
 import functools
-import textwrap
 from collections import defaultdict
 
+from river import utils
+
 
 class ConfusionMatrix:
     """Confusion Matrix for binary and multi-class classification.
@@ -85,38 +86,16 @@ def classes(self):
 
     def __repr__(self):
 
-        # The classes are sorted alphabetically for reproducibility reasons
         classes = sorted(self.classes)
-
         if not classes:
             return ""
 
-        # Determine the required width of each column in the table
-        largest_label_len = max(len(str(c)) for c in classes)
-        largest_number_len = len(
-            str(max(max(v for v in c.values()) for c in self.data.values()))
-        )
-        width = max(largest_label_len, largest_number_len) + 2
-
-        # Make a template to print out rows one by one
-        row_format = "{:>{width}}" * (len(classes) + 1)
-
-        # Write down the header
-        table = row_format.format("", *map(str, classes), width=width) + "\n"
-
-        # Write down the true labels row by row
-        table += "\n".join(
-            (
-                row_format.format(
-                    str(y_true),
-                    *[f"{self.data[y_true][y_pred]:0.0f}" for y_pred in classes],
-                    width=width,
-                )
-                for y_true in classes
-            )
-        )
+        headers = [""] + list(map(str, classes))
+        columns = [headers[1:]]
+        for col in classes:
+            columns.append([f"{int(self.data[row][col]):,}" for row in classes])
 
-        return textwrap.dedent(table)
+        return utils.pretty.print_table(headers, columns)
 
     def support(self, label):
         return self.sum_row[label]

diff --git a/river/metrics/multioutput/confusion.py b/river/metrics/multioutput/confusion.py
@@ -1,4 +1,3 @@
-import collections
 import textwrap
 
 from river import metrics
@@ -31,33 +30,43 @@ class MultiLabelConfusionMatrix:
 
     >>> cm
     0
-               False   True
-        False      0      1
-         True      0      1
+                False   True
+        False       0      1
+         True       0      1
     <BLANKLINE>
     1
-               False   True
-        False      0      0
-         True      1      1
+                False   True
+        False       0      0
+         True       1      1
     <BLANKLINE>
     2
-               False   True
-        False      1      0
-         True      0      1
+                False   True
+        False       1      0
+         True       0      1
 
     """
 
     def __init__(self):
-        self.data = collections.defaultdict(metrics.ConfusionMatrix)
+        self.data = dict()
 
     def update(self, y_true, y_pred, sample_weight=1.0):
         for label, yt in y_true.items():
-            self.data[label].update(yt, y_pred[label], sample_weight)
+            try:
+                cm = self.data[label]
+            except KeyError:
+                cm = metrics.ConfusionMatrix()
+                self.data[label] = cm
+            cm.update(yt, y_pred[label], sample_weight)
         return self
 
     def revert(self, y_true, y_pred, sample_weight=1.0):
         for label, yt in y_true.items():
-            self.data[label].revert(yt, y_pred[label], sample_weight)
+            try:
+                cm = self.data[label]
+            except KeyError:
+                cm = metrics.ConfusionMatrix()
+                self.data[label] = cm
+            cm.revert(yt, y_pred[label], sample_weight)  # undo, don't re-apply, the sample
         return self
 
     def __repr__(self):

diff --git a/river/misc/__init__.py b/river/misc/__init__.py
@@ -1,6 +1,11 @@
-"""Miscellaneous algorithms."""
+"""Miscellaneous.
+
+This module essentially regroups some implementations that have nowhere else to go.
+
+"""
+from .cov_matrix import CovMatrix
 from .histogram import Histogram
 from .sdft import SDFT
 from .skyline import Skyline
 
-__all__ = ["Histogram", "SDFT", "Skyline"]
+__all__ = ["CovMatrix", "Histogram", "SDFT", "Skyline"]
diff --git a/river/misc/cov_matrix.py b/river/misc/cov_matrix.py
@@ -0,0 +1,136 @@
+import collections
+import itertools
+
+import pandas as pd
+
+from river import stats, utils
+
+
+class CovMatrix(collections.UserDict):
+    """Covariance matrix.
+
+    Parameters
+    ----------
+    ddof
+        Delta Degrees of Freedom.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> from river import misc
+
+    >>> np.random.seed(42)
+    >>> X = pd.DataFrame(np.random.random((8, 3)), columns=["red", "green", "blue"])
+    >>> X
+            red     green      blue
+    0  0.374540  0.950714  0.731994
+    1  0.598658  0.156019  0.155995
+    2  0.058084  0.866176  0.601115
+    3  0.708073  0.020584  0.969910
+    4  0.832443  0.212339  0.181825
+    5  0.183405  0.304242  0.524756
+    6  0.431945  0.291229  0.611853
+    7  0.139494  0.292145  0.366362
+
+    >>> cov = misc.CovMatrix()
+    >>> for x in X.to_dict(orient="records"):
+    ...     cov = cov.update(x)
+    >>> cov
+            blue     green    red
+     blue    0.076    0.020   -0.010
+    green    0.020    0.113   -0.053
+      red   -0.010   -0.053    0.079
+
+    There is also an `update_many` method to process mini-batches. The results are identical.
+
+    >>> cov = misc.CovMatrix()
+    >>> cov = cov.update_many(X)
+    >>> cov
+            blue     green    red
+     blue    0.076    0.020   -0.010
+    green    0.020    0.113   -0.053
+      red   -0.010   -0.053    0.079
+
+    The covariances are stored in a dictionary, meaning any one of them can be accessed as such:
+
+    >>> cov["blue", "green"]
+    Cov: 0.020292
+
+    """
+
+    _fmt = ",.3f"
+
+    def __init__(self, ddof=1):
+        super().__init__()
+        self.ddof = ddof
+
+    def update(self, x: dict):
+        """Update with a single sample.
+
+        Parameters
+        ----------
+        x
+            A sample.
+
+        """
+
+        for i, j in itertools.combinations_with_replacement(sorted(x), r=2):
+            try:
+                cov = self[i, j]
+            except KeyError:
+                self[i, j] = stats.Cov(self.ddof)
+                cov = self[i, j]
+            cov.update(x[i], x[j])
+
+        return self
+
+    def update_many(self, X: pd.DataFrame):
+        """Update with many samples.
+
+        Parameters
+        ----------
+        X
+            Samples.
+
+        """
+
+        for i, j in itertools.combinations_with_replacement(sorted(X.columns), r=2):
+            try:
+                cov = self[i, j]
+            except KeyError:
+                self[i, j] = stats.Cov(self.ddof)
+                cov = self[i, j]
+            cov.update_many(X[i].values, X[j].values)
+
+        return self
+
+    def __getitem__(self, key):
+        """
+
+        A covariance matrix is symmetric. For ease of use we make the __getitem__ method symmetric.
+
+        """
+        x, y = key
+        try:
+            return super().__getitem__((x, y))
+        except KeyError:
+            return super().__getitem__((y, x))
+
+    def __repr__(self):
+
+        names = sorted(set(i for i, _ in self))
+
+        headers = [""] + list(map(str, names))
+        columns = [headers[1:]]
+        for col in names:
+            column = []
+            for row in names:
+                try:
+                    column.append(f"{self[row, col].get():{self._fmt}}")
+                except KeyError:
+                    column.append("")
+            columns.append(column)
+
+        return utils.pretty.print_table(headers, columns)
diff --git a/river/misc/histogram.py b/river/misc/histogram.py
@@ -84,7 +84,7 @@ class Histogram(collections.UserList):
     Examples
     --------
 
-    >>> from river import special
+    >>> from river import misc
     >>> import matplotlib.pyplot as plt
     >>> import numpy as np
 
@@ -95,7 +95,7 @@ class Histogram(collections.UserList):
     ...     np.random.normal(3, 1, 1000),
     ... ))
 
-    >>> hist = special.Histogram(max_bins=60)
+    >>> hist = misc.Histogram(max_bins=60)
 
     >>> for x in values:
     ...     hist = hist.update(x)
@@ -196,9 +196,9 @@ def iter_cdf(self, X, verbose=False):
         Examples
         --------
 
-        >>> from river import special
+        >>> from river import misc
 
-        >>> hist = special.Histogram()
+        >>> hist = misc.Histogram()
         >>> for x in range(4):
         ...     hist = hist.update(x)
 
@@ -244,9 +244,9 @@ def cdf(self, x):
         Examples
         --------
 
-        >>> from river import special
+        >>> from river import misc
 
-        >>> hist = special.Histogram()
+        >>> hist = misc.Histogram()
         >>> for x in range(4):
         ...     hist = hist.update(x)
 

diff --git a/river/misc/sdft.py b/river/misc/sdft.py
@@ -20,12 +20,12 @@ class SDFT:
     --------
 
     >>> import numpy as np
-    >>> from river import special
+    >>> from river import misc
 
     >>> X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
     >>> window_size = 5
-    >>> sdft = special.SDFT(window_size)
+    >>> sdft = misc.SDFT(window_size)
 
     >>> for i, x in enumerate(X):
     ...     sdft = sdft.update(x)

diff --git a/river/misc/skyline.py b/river/misc/skyline.py
@@ -22,7 +22,7 @@ class Skyline(collections.UserList):
     Here is an example taken from [this](https://maxhalford.github.io/blog/skyline-queries/) blog post.
 
     >>> import random
-    >>> from river import special
+    >>> from river import misc
     >>> import matplotlib.pyplot as plt
 
     >>> city_prices = {
@@ -37,7 +37,7 @@ class Skyline(collections.UserList):
     ...     price = round(random.uniform(0.8, 1.2) * city_prices[city] * size)
     ...     return {'city': city, 'size': size, 'price': price}
 
-    >>> skyline = special.Skyline(minimize=['price'], maximize=['size'])
+    >>> skyline = misc.Skyline(minimize=['price'], maximize=['size'])
 
     >>> random.seed(42)
 
@@ -67,7 +67,7 @@ class Skyline(collections.UserList):
     Here is another example using the kart data from *Mario Kart: Double Dash!!*.
 
     >>> import collections
-    >>> from river import special
+    >>> from river import misc
 
     >>> Kart = collections.namedtuple(
     ...      'Kart',
@@ -98,7 +98,7 @@ class Skyline(collections.UserList):
     ...     Kart('Parade Kart', 7, 3, 4, 7, 3)
     ... ]
 
-    >>> skyline = special.Skyline(
+    >>> skyline = misc.Skyline(
     ...     maximize=['speed', 'off_road', 'acceleration', 'turbo'],
     ...     minimize=['weight']
     ... )
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,6 +34,7 @@ @@
     ## misc
     - Created this module to store some stuff that was in the `utils` module but wasn't necessarily shared between modules.
+    - Implement `misc.CovMatrix`.
     ## reco
@@ Expand Down @@