Skip to content

Commit

Permalink
Implement covariance matrix (#839)
Browse files Browse the repository at this point in the history
* init

* Implement CovMatrix

* typos

* wrong import

* fix confusion

* Update cov.py

* Update confusion.py

* fix confusion

* Update cov.py

Former-commit-id: ec66af8
  • Loading branch information
MaxHalford authored Feb 4, 2022
1 parent b5f6381 commit 1ab1a7d
Show file tree
Hide file tree
Showing 10 changed files with 218 additions and 90 deletions.
1 change: 1 addition & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
## misc

- Created this module to store some stuff that was in the `utils` module but wasn't necessarily shared between modules.
- Implement `misc.CovMatrix`.

## reco

Expand Down
35 changes: 7 additions & 28 deletions river/metrics/confusion.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import functools
import textwrap
from collections import defaultdict

from river import utils


class ConfusionMatrix:
"""Confusion Matrix for binary and multi-class classification.
Expand Down Expand Up @@ -85,38 +86,16 @@ def classes(self):

def __repr__(self):
    """Render the confusion matrix as a text table.

    The first column holds the class labels. Each following column
    corresponds to one class and lists the values of
    ``self.data[row][col]`` for every class ``row``, where the outer
    key of ``self.data`` is the true label and the inner key is the
    predicted label.

    Returns
    -------
    str
        An empty string when no class has been seen, otherwise the
        table produced by ``utils.pretty.print_table``.

    """

    # The classes are sorted alphabetically for reproducibility reasons
    classes = sorted(self.classes)

    if not classes:
        return ""

    # headers[0] is the blank corner cell above the row labels; the first
    # column repeats the class names so each row is labelled.
    headers = [""] + list(map(str, classes))
    columns = [headers[1:]]
    for col in classes:
        # int() truncates any fractional (weighted) count before the
        # thousands separator is applied.
        columns.append([f"{int(self.data[row][col]):,}" for row in classes])

    return utils.pretty.print_table(headers, columns)

def support(self, label):
    """Return the entry of ``self.sum_row`` stored under ``label``."""
    row_totals = self.sum_row
    return row_totals[label]
Expand Down
35 changes: 22 additions & 13 deletions river/metrics/multioutput/confusion.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import collections
import textwrap

from river import metrics
Expand Down Expand Up @@ -31,33 +30,43 @@ class MultiLabelConfusionMatrix:
>>> cm
0
False True
False 0 1
True 0 1
False True
False 0 1
True 0 1
<BLANKLINE>
1
False True
False 0 0
True 1 1
False True
False 0 0
True 1 1
<BLANKLINE>
2
False True
False 1 0
True 0 1
False True
False 1 0
True 0 1
"""

def __init__(self):
self.data = collections.defaultdict(metrics.ConfusionMatrix)
self.data = dict()

def update(self, y_true, y_pred, sample_weight=1.0):
    """Update the per-label confusion matrices with one observation.

    Parameters
    ----------
    y_true
        Mapping from output label to the true value for that label.
    y_pred
        Predicted values; indexed with every key of ``y_true``.
    sample_weight
        Weight forwarded to each per-label ``update`` call.

    Returns
    -------
    self

    """
    for label, yt in y_true.items():
        try:
            cm = self.data[label]
        except KeyError:
            # First time this label is seen: start a fresh matrix for it.
            cm = metrics.ConfusionMatrix()
            self.data[label] = cm
        cm.update(yt, y_pred[label], sample_weight)
    return self

def revert(self, y_true, y_pred, sample_weight=1.0):
    """Undo a previous update on the per-label confusion matrices.

    Parameters
    ----------
    y_true
        Mapping from output label to the true value for that label.
    y_pred
        Predicted values; indexed with every key of ``y_true``.
    sample_weight
        Weight forwarded to each per-label ``revert`` call.

    Returns
    -------
    self

    """
    for label, yt in y_true.items():
        try:
            cm = self.data[label]
        except KeyError:
            # A label that was never seen still gets a matrix so the
            # revert call below has something to operate on.
            cm = metrics.ConfusionMatrix()
            self.data[label] = cm
        # Must call `revert`, not `update`: calling `update` here would add
        # the observation a second time instead of removing it.
        cm.revert(yt, y_pred[label], sample_weight)
    return self

def __repr__(self):
Expand Down
9 changes: 7 additions & 2 deletions river/misc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
"""Miscellaneous algorithms."""
"""Miscellaneous.
This module essentially groups together implementations that have nowhere else to go.
"""
from .cov_matrix import CovMatrix
from .histogram import Histogram
from .sdft import SDFT
from .skyline import Skyline

__all__ = ["Histogram", "SDFT", "Skyline"]
__all__ = ["CovMatrix", "Histogram", "SDFT", "Skyline"]
136 changes: 136 additions & 0 deletions river/misc/cov_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import collections
import itertools

import pandas as pd

from river import stats, utils


class CovMatrix(collections.UserDict):
"""Covariance matrix.
Parameters
----------
ddof
Delta Degrees of Freedom.
Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from river import misc
>>> np.random.seed(42)
>>> X = pd.DataFrame(np.random.random((8, 3)), columns=["red", "green", "blue"])
>>> X
red green blue
0 0.374540 0.950714 0.731994
1 0.598658 0.156019 0.155995
2 0.058084 0.866176 0.601115
3 0.708073 0.020584 0.969910
4 0.832443 0.212339 0.181825
5 0.183405 0.304242 0.524756
6 0.431945 0.291229 0.611853
7 0.139494 0.292145 0.366362
>>> cov = misc.CovMatrix()
>>> for x in X.to_dict(orient="records"):
... cov = cov.update(x)
>>> cov
blue green red
blue 0.076 0.020 -0.010
green 0.020 0.113 -0.053
red -0.010 -0.053 0.079
There is also an `update_many` method to process mini-batches. The results are identical.
>>> cov = misc.CovMatrix()
>>> cov = cov.update_many(X)
>>> cov
blue green red
blue 0.076 0.020 -0.010
green 0.020 0.113 -0.053
red -0.010 -0.053 0.079
The covariances are stored in a dictionary, meaning any one of them can be accessed as such:
>>> cov["blue", "green"]
Cov: 0.020292
"""

_fmt = ",.3f"

def __init__(self, ddof=1):
super().__init__()
self.ddof = ddof

def update(self, x: dict):
"""Update with a single sample.
Parameters
----------
x
A sample.
"""

for i, j in itertools.combinations_with_replacement(sorted(x), r=2):
try:
cov = self[i, j]
except KeyError:
self[i, j] = stats.Cov(self.ddof)
cov = self[i, j]
cov.update(x[i], x[j])

return self

def update_many(self, X: pd.DataFrame):
"""Update with many samples.
Parameters
----------
X
Samples.
"""

for i, j in itertools.combinations_with_replacement(sorted(X.columns), r=2):
try:
cov = self[i, j]
except KeyError:
self[i, j] = stats.Cov(self.ddof)
cov = self[i, j]
cov.update_many(X[i].values, X[j].values)

return self

def __getitem__(self, key):
"""
A covariance matrix is symmetric. For ease of use we make the __getitem__ method symmetric.
"""
x, y = key
try:
return super().__getitem__((x, y))
except KeyError:
return super().__getitem__((y, x))

def __repr__(self):
    """Render the matrix as a text table via ``utils.pretty.print_table``.

    Row and column names are the sorted first elements of the stored keys.
    Each cell is the ``get()`` value of the stored entry formatted with
    ``self._fmt``; a pair with no stored entry yields an empty cell.

    """

    def _cell(row, col):
        try:
            return f"{self[row, col].get():{self._fmt}}"
        except KeyError:
            return ""

    names = sorted({first for first, _ in self})
    labels = [str(name) for name in names]

    # The blank header sits above the column of row labels.
    headers = [""] + labels
    columns = [labels] + [[_cell(row, col) for row in names] for col in names]

    return utils.pretty.print_table(headers, columns)
12 changes: 6 additions & 6 deletions river/misc/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class Histogram(collections.UserList):
Examples
--------
>>> from river import special
>>> from river import misc
>>> import matplotlib.pyplot as plt
>>> import numpy as np
Expand All @@ -95,7 +95,7 @@ class Histogram(collections.UserList):
... np.random.normal(3, 1, 1000),
... ))
>>> hist = special.Histogram(max_bins=60)
>>> hist = misc.Histogram(max_bins=60)
>>> for x in values:
... hist = hist.update(x)
Expand Down Expand Up @@ -196,9 +196,9 @@ def iter_cdf(self, X, verbose=False):
Examples
--------
>>> from river import special
>>> from river import misc
>>> hist = special.Histogram()
>>> hist = misc.Histogram()
>>> for x in range(4):
... hist = hist.update(x)
Expand Down Expand Up @@ -244,9 +244,9 @@ def cdf(self, x):
Examples
--------
>>> from river import special
>>> from river import misc
>>> hist = special.Histogram()
>>> hist = misc.Histogram()
>>> for x in range(4):
... hist = hist.update(x)
Expand Down
4 changes: 2 additions & 2 deletions river/misc/sdft.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ class SDFT:
--------
>>> import numpy as np
>>> from river import special
>>> from river import misc
>>> X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
>>> window_size = 5
>>> sdft = special.SDFT(window_size)
>>> sdft = misc.SDFT(window_size)
>>> for i, x in enumerate(X):
... sdft = sdft.update(x)
Expand Down
8 changes: 4 additions & 4 deletions river/misc/skyline.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Skyline(collections.UserList):
Here is an example taken from [this](https://maxhalford.github.io/blog/skyline-queries/) blog post.
>>> import random
>>> from river import special
>>> from river import misc
>>> import matplotlib.pyplot as plt
>>> city_prices = {
Expand All @@ -37,7 +37,7 @@ class Skyline(collections.UserList):
... price = round(random.uniform(0.8, 1.2) * city_prices[city] * size)
... return {'city': city, 'size': size, 'price': price}
>>> skyline = special.Skyline(minimize=['price'], maximize=['size'])
>>> skyline = misc.Skyline(minimize=['price'], maximize=['size'])
>>> random.seed(42)
Expand Down Expand Up @@ -67,7 +67,7 @@ class Skyline(collections.UserList):
Here is another example using the kart data from *Mario Kart: Double Dash!!*.
>>> import collections
>>> from river import special
>>> from river import misc
>>> Kart = collections.namedtuple(
... 'Kart',
Expand Down Expand Up @@ -98,7 +98,7 @@ class Skyline(collections.UserList):
... Kart('Parade Kart', 7, 3, 4, 7, 3)
... ]
>>> skyline = special.Skyline(
>>> skyline = misc.Skyline(
... maximize=['speed', 'off_road', 'acceleration', 'turbo'],
... minimize=['weight']
... )
Expand Down
Loading

0 comments on commit 1ab1a7d

Please sign in to comment.