Skip to content

Commit

Permalink
Batch update for Mean/Var/Cov (#838)
Browse files Browse the repository at this point in the history
* simplify __add__ and __sub__

* implement Mean.update_many

* implement Var.update_many

* don't inherit from SortedWindow

* implement test_add_bivariate

* remove tests, they're already there

* typo

* implement Cov.update_many

* flake

* Update test_var.py

* unweighted

* types

* Update cov.py

* Update iqr.py

* Update unreleased.md

* rename special to misc

Former-commit-id: be2a904
  • Loading branch information
MaxHalford authored Feb 3, 2022
1 parent 8fc4330 commit b5f6381
Show file tree
Hide file tree
Showing 15 changed files with 165 additions and 82 deletions.
8 changes: 6 additions & 2 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,20 @@

- `metrics.ROCAUC` works with `base.AnomalyDetectors`s.

## misc

- Created this module to store some stuff that was in the `utils` module but wasn't necessarily shared between modules.

## reco

- Renamed the `Recommender` base class into `Ranker`.
- Added a `rank` method to each recommender.
- Removed `reco.SurpriseWrapper` as it wasn't really useful.
- Added an `is_contextual` property to each ranker to indicate if a model makes use of contextual features or not.

## special
## stats

- Created this module to store some stuff that was in the `utils` module.
- `stats.Mean`, `stats.Var`, and `stats.Cov` each now have an `update_many` method which accepts numpy arrays.

## utils

Expand Down
2 changes: 2 additions & 0 deletions river/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
imblearn,
linear_model,
metrics,
misc,
model_selection,
multiclass,
multioutput,
Expand Down Expand Up @@ -60,6 +61,7 @@
"imblearn",
"linear_model",
"metrics",
"misc",
"model_selection",
"multiclass",
"multioutput",
Expand Down
1 change: 1 addition & 0 deletions river/special/__init__.py → river/misc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Miscellaneous algorithms."""
from .histogram import Histogram
from .sdft import SDFT
from .skyline import Skyline
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
66 changes: 39 additions & 27 deletions river/stats/cov.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import copy

from . import base, mean, summing
import numpy as np

from . import base, summing


class Cov(base.Bivariate):
Expand Down Expand Up @@ -43,91 +45,101 @@ class Cov(base.Bivariate):

def __init__(self, ddof=1):
self.ddof = ddof
self.mean_x = mean.Mean()
self.mean_y = mean.Mean()
self.mean_x = 0
self.mean_y = 0
self.n = 0
self._C = 0
self.cov = 0

def update(self, x, y, w=1.0):
dx = x - self.mean_x.get()
self.mean_x.update(x, w)
self.mean_y.update(y, w)
dy = y - self.mean_y.get()
self.cov += w * (dx * dy - self.cov) / max(1, self.mean_x.n - self.ddof)
self.n += w
dx = x - self.mean_x
self.mean_x += (w / self.n) * dx
self.mean_y += (w / self.n) * (y - self.mean_y)
self._C += w * dx * (y - self.mean_y)
self.cov = self._C / max(1, self.n - self.ddof)
return self

def update_many(self, X: np.ndarray, Y: np.ndarray):
self.n += len(X)
dx = X - self.mean_x
self.mean_x += dx.sum() / self.n
self.mean_y += ((Y - self.mean_y)).sum() / self.n
self._C += (dx * (Y - self.mean_y)).sum()
self.cov = self._C / max(1, self.n - self.ddof)
return self

def get(self):
return self.cov

def __iadd__(self, other):
old_mean_x = self.mean_x.get()
old_mean_y = self.mean_y.get()
old_n = self.mean_x.n
old_mean_x = self.mean_x
old_mean_y = self.mean_y
old_n = self.n

# Update mean estimates
self.mean_x += other.mean_x
self.mean_y += other.mean_y

if self.mean_x.n <= self.ddof:
if self.n <= self.ddof:
return self

# Scale factors
scale_a = old_n - self.ddof
scale_b = other.mean_x.n - other.ddof
scale_b = other.n - other.ddof

# Scale the covariances
self.cov = scale_a * self.cov + scale_b * other.cov
# Apply correction factor
self.cov += (
(old_mean_x - other.mean_x.get())
* (old_mean_y - other.mean_y.get())
* ((old_n * other.mean_x.n) / self.mean_x.n)
(old_mean_x - other.mean_x)
* (old_mean_y - other.mean_y)
* ((old_n * other.n) / self.n)
)
# Reapply scale
self.cov /= self.mean_x.n - self.ddof
self.cov /= self.n - self.ddof

return self

def __add__(self, other):
result = copy.deepcopy(self)
result += other

return result

def __isub__(self, other):
if self.mean_x.n <= self.ddof:
if self.n <= self.ddof:
return self

old_n = self.mean_x.n
old_n = self.n

# Update mean estimates
self.mean_x -= other.mean_x
self.mean_y -= other.mean_y

if self.mean_x.n <= self.ddof:
if self.n <= self.ddof:
self.cov = 0
return self

# Scale factors
scale_x = old_n - self.ddof
scale_b = other.mean_x.n - other.ddof
scale_b = other.n - other.ddof

# Scale the covariances
self.cov = scale_x * self.cov - scale_b * other.cov
# Apply correction
self.cov -= (
(self.mean_x.get() - other.mean_x.get())
* (self.mean_y.get() - other.mean_y.get())
* ((self.mean_x.n * other.mean_x.n) / old_n)
(self.mean_x - other.mean_x)
* (self.mean_y - other.mean_y)
* ((self.n * other.n) / old_n)
)
# Re-apply scale factor
self.cov /= self.mean_x.n - self.ddof
self.cov /= self.n - self.ddof

return self

def __sub__(self, other):
result = copy.deepcopy(self)
result -= other

return result


Expand Down
4 changes: 1 addition & 3 deletions river/stats/iqr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .. import utils
from . import base, quantile


Expand Down Expand Up @@ -64,7 +63,7 @@ def get(self):
return q_sup - q_inf


class RollingIQR(base.RollingUnivariate, utils.SortedWindow):
class RollingIQR(base.RollingUnivariate):
"""Computes the rolling interquartile range.
Parameters
Expand Down Expand Up @@ -105,7 +104,6 @@ class RollingIQR(base.RollingUnivariate, utils.SortedWindow):
"""

def __init__(self, window_size: int, q_inf=0.25, q_sup=0.75):
super().__init__(size=window_size)
if q_inf >= q_sup:
raise ValueError("q_inf must be strictly less than q_sup")
self.q_inf = q_inf
Expand Down
20 changes: 10 additions & 10 deletions river/stats/maximum.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get(self):
return self.max


class RollingMax(base.RollingUnivariate, utils.SortedWindow):
class RollingMax(base.RollingUnivariate):
"""Running max over a window.
Parameters
Expand All @@ -70,19 +70,19 @@ class RollingMax(base.RollingUnivariate, utils.SortedWindow):
"""

def __init__(self, window_size: int):
super().__init__(size=window_size)
self.window = utils.SortedWindow(size=window_size)

@property
def window_size(self):
return self.size
return self.window.size

def update(self, x):
self.append(x)
self.window.append(x)
return self

def get(self):
try:
return self[-1]
return self.window[-1]
except IndexError:
return None

Expand Down Expand Up @@ -125,7 +125,7 @@ def get(self):
return self.abs_max


class RollingAbsMax(base.RollingUnivariate, utils.SortedWindow):
class RollingAbsMax(base.RollingUnivariate):
"""Running absolute max over a window.
Parameters
Expand All @@ -152,18 +152,18 @@ class RollingAbsMax(base.RollingUnivariate, utils.SortedWindow):
"""

def __init__(self, window_size: int):
super().__init__(size=window_size)
self.window = utils.SortedWindow(size=window_size)

@property
def window_size(self):
return self.size
return self.window.size

def update(self, x):
self.append(abs(x))
self.window.append(abs(x))
return self

def get(self):
try:
return self[-1]
return self.window[-1]
except IndexError:
return None
13 changes: 9 additions & 4 deletions river/stats/mean.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import copy

import numpy as np

from . import base, summing


Expand Down Expand Up @@ -45,6 +47,13 @@ def update(self, x, w=1.0):
self._mean += (w / self.n) * (x - self._mean)
return self

def update_many(self, X: np.ndarray):
a = self.n / (self.n + len(X))
b = len(X) / (self.n + len(X))
self._mean = a * self._mean + b * np.mean(X)
self.n += len(X)
return self

def revert(self, x, w=1.0):
self.n -= w
if self.n < 0:
Expand All @@ -70,13 +79,11 @@ def __iadd__(self, other):
old_n = self.n
self.n += other.n
self._mean = (old_n * self._mean + other.n * other._mean) / self.n

return self

def __add__(self, other):
result = copy.deepcopy(self)
result += other

return result

def __isub__(self, other):
Expand All @@ -88,13 +95,11 @@ def __isub__(self, other):
else:
self.n = 0.0
self._mean = 0.0

return self

def __sub__(self, other):
result = copy.deepcopy(self)
result -= other

return result


Expand Down
10 changes: 5 additions & 5 deletions river/stats/minimum.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def get(self):
return self.min


class RollingMin(base.RollingUnivariate, utils.SortedWindow):
class RollingMin(base.RollingUnivariate):
"""Running min over a window.
Parameters
Expand All @@ -54,18 +54,18 @@ class RollingMin(base.RollingUnivariate, utils.SortedWindow):
"""

def __init__(self, window_size: int):
super().__init__(size=window_size)
self.window = utils.SortedWindow(size=window_size)

@property
def window_size(self):
return self.size
return self.window.size

def update(self, x):
self.append(x)
self.window.append(x)
return self

def get(self):
try:
return self[0]
return self.window[0]
except IndexError:
return None
Loading

0 comments on commit b5f6381

Please sign in to comment.