Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MultivariateGaussian #1301

Merged
merged 17 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ Calling `learn_one` in a pipeline will now update each part of the pipeline in t

- Added `preprocessing.OrdinalEncoder`, to map string features to integers.

## proba

- Added `proba.MultivariateGaussian`.

## stream

- `stream.iter_arff` now supports sparse data.
Expand Down
3 changes: 2 additions & 1 deletion river/proba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from . import base
from .beta import Beta
from .gaussian import Gaussian
from .gaussian import MultivariateGaussian
from .multinomial import Multinomial

__all__ = ["base", "Beta", "Gaussian", "Multinomial"]
__all__ = ["base", "Beta", "Gaussian", "Multinomial", "MultivariateGaussian"]
23 changes: 23 additions & 0 deletions river/proba/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,26 @@ def revert(self, x: float):
@abc.abstractmethod
def cdf(self, x: float):
"""Cumulative density function, i.e. P(X <= x)."""


class MultivariateContinuousDistribution(Distribution):
    """Abstract interface for distributions over several continuous variables.

    Observations are dictionaries that map a variable name to its value.
    Concrete subclasses have to provide `update`, `revert` and `cdf`.

    Parameters
    ----------
    seed
        Random number generator seed for reproducibility.

    """

    @abc.abstractmethod
    def update(self, x: dict[str, float]):
        """Incorporate the observation `x` into the distribution's parameters."""

    @abc.abstractmethod
    def revert(self, x: dict[str, float]):
        """Remove the observation `x` from the distribution's parameters."""

    @abc.abstractmethod
    def cdf(self, x: dict[str, float]) -> float:
        """Evaluate the cumulative distribution function at `x`, i.e. P(X <= x)."""
206 changes: 205 additions & 1 deletion river/proba/gaussian.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from __future__ import annotations

import math
import warnings

import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

from river import covariance
from river import stats
from river.proba import base

__all__ = ["Gaussian"]
__all__ = ["Gaussian", "MultivariateGaussian"]


class Gaussian(base.ContinuousDistribution):
Expand Down Expand Up @@ -90,3 +96,201 @@ def sample(self):
@property
def mode(self):
return self.mu


class MultivariateGaussian(base.MultivariateContinuousDistribution):
    """Multivariate normal distribution with parameters mu and var.

    The parameters are estimated online: every call to `update` feeds the
    observation to an internal `covariance.EmpiricalCovariance(ddof=1)`, from
    which the mean vector and the covariance matrix are read back on demand.
    Variables are always reported in sorted name order.

    Parameters
    ----------
    seed
        Random number generator seed for reproducibility.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd

    >>> np.random.seed(42)
    >>> X = pd.DataFrame(np.random.random((8, 3)),
    ...                  columns=["red", "green", "blue"])
    >>> X
            red     green      blue
    0  0.374540  0.950714  0.731994
    1  0.598658  0.156019  0.155995
    2  0.058084  0.866176  0.601115
    3  0.708073  0.020584  0.969910
    4  0.832443  0.212339  0.181825
    5  0.183405  0.304242  0.524756
    6  0.431945  0.291229  0.611853
    7  0.139494  0.292145  0.366362

    >>> p = MultivariateGaussian()
    >>> p.n_samples
    0.0

    >>> for x in X.to_dict(orient="records"):
    ...     p = p.update(x)
    >>> p.var
               blue     green       red
    blue   0.076119  0.020292 -0.010128
    green  0.020292  0.112931 -0.053268
    red   -0.010128 -0.053268  0.078961

    Retrieving current state in nice format is simple

    >>> p
    𝒩(
     μ=(0.518, 0.387, 0.416),
     σ^2=(
     [ 0.076  0.020 -0.010]
     [ 0.020  0.113 -0.053]
     [-0.010 -0.053  0.079]
     )
    )

    To retrieve number of samples and mode

    >>> p.n_samples
    8.0
    >>> p.mode  # doctest: +ELLIPSIS
    {'blue': 0.5179..., 'green': 0.3866..., 'red': 0.4158...}

    To retrieve pdf and cdf

    >>> p(x)  # doctest: +ELLIPSIS
    0.97967086129734...
    >>> p.cdf(x)  # doctest: +ELLIPSIS
    0.00509653891791713...

    To sample data from distribution

    >>> p.sample()  # doctest: +ELLIPSIS
    {'blue': 0.3053..., 'green': -0.0532..., 'red': 0.7388...}

    MultivariateGaussian works with `utils.Rolling`

    >>> from river import utils
    >>> p = utils.Rolling(MultivariateGaussian(), window_size=5)
    >>> for x in X.to_dict(orient="records"):
    ...     p = p.update(x)
    >>> p.var
               blue     green       red
    blue   0.087062 -0.022873  0.007765
    green -0.022873  0.014279 -0.025181
    red    0.007765 -0.025181  0.095066

    MultivariateGaussian works with `utils.TimeRolling`

    >>> from datetime import datetime as dt, timedelta as td
    >>> X.index = [dt(2023, 3, 28, 0, 0, 0) + td(seconds=x) for x in range(8)]
    >>> p = utils.TimeRolling(MultivariateGaussian(), period=td(seconds=5))
    >>> for t, x in X.iterrows():
    ...     p = p.update(x.to_dict(), t=t)
    >>> p.var
               blue     green       red
    blue   0.087062 -0.022873  0.007765
    green -0.022873  0.014279 -0.025181
    red    0.007765 -0.025181  0.095066

    Variance on diagonal is consistent with Gaussian

    >>> from river.proba import Gaussian
    >>> p = MultivariateGaussian()
    >>> p_ = Gaussian()
    >>> for t, x in X.iterrows():
    ...     p = p.update(x.to_dict())
    ...     p_ = p_.update(x['blue'])
    >>> p.sigma['blue']['blue'] == p_.sigma
    True
    """

    def __init__(self, seed=None):
        super().__init__(seed)
        # Running covariance estimator; ddof=1 is passed to the estimator.
        self._var = covariance.EmpiricalCovariance(ddof=1)

    # TODO: add method _from_state to initialize model (for warm starting)

    @property
    def n_samples(self) -> float:
        """Number of observations seen so far (0.0 before the first update)."""
        if not self._var.matrix:
            return 0.0
        # Read the counter from the last entry stored in the covariance matrix.
        return list(self._var.matrix.values())[-1].mean.n

    @property
    def mu(self) -> dict:
        """The mean value of the distribution, keyed by variable name (sorted)."""
        # Only the diagonal entries (key1 == key2) are used to read a
        # variable's running mean.
        return {
            key1: values.mean.get()
            for (key1, key2), values in sorted(self._var.matrix.items())
            if key1 == key2
        }

    @property
    def var(self) -> pd.DataFrame:
        """The variance of the distribution.

        Returns
        -------
        A square, symmetric DataFrame whose index and columns are the sorted
        variable names; the diagonal holds variances and the off-diagonal
        cells hold covariances.
        """
        variables = sorted({name for pair in self._var.matrix for name in pair})
        n_variables = len(variables)
        cov_array = np.zeros((n_variables, n_variables))

        # Walk the upper triangle only and mirror each value, so that every
        # pairwise statistic is looked up exactly once.
        for i, row_name in enumerate(variables):
            for j in range(i, n_variables):
                value = self._var[(row_name, variables[j])].get()
                cov_array[i, j] = value
                cov_array[j, i] = value

        return pd.DataFrame(cov_array, index=variables, columns=variables)

    @property
    def sigma(self) -> pd.DataFrame:
        """The standard deviation of the distribution.

        Computed as the element-wise square root of `var`.
        """
        # NOTE(review): the power is applied to every cell, so negative
        # covariances produce NaN off the diagonal; only the diagonal is a
        # standard deviation in the usual sense.
        return self.var**0.5

    def __repr__(self):
        mu_str = ", ".join(f"{m:.3f}" for m in self.mu.values())
        var_str = self.var.to_string(float_format="{:0.3f}".format, header=False, index=False)
        # Wrap every matrix row in square brackets.
        var_str = " [" + var_str.replace("\n", "]\n [") + "]"
        return f"𝒩(\n μ=({mu_str}),\n σ^2=(\n{var_str}\n )\n)"

    def update(self, x):
        """Add the observation `x` to the estimate and return `self`."""
        # TODO: add support for weighted samples
        self._var.update(x)
        return self

    def revert(self, x):
        """Remove the observation `x` from the estimate and return `self`."""
        # TODO: add support for weighted samples
        self._var.revert(x)
        return self

    def __call__(self, x: dict[str, float]) -> float:
        """PDF(x) method.

        Parameters
        ----------
        x
            Observation; it must contain a value for every known variable.

        Returns
        -------
        The probability density at `x`, or 0.0 when scipy refuses to build or
        evaluate the distribution (`ValueError` / `OverflowError`).
        """
        mu = self.mu
        # Order the observation like the mean vector (sorted variable names).
        x_ = [x[name] for name in mu]
        var = self.var
        try:
            return float(multivariate_normal([*mu.values()], var).pdf(x_))
        # TODO: validate occurrence of ValueError
        # The input matrix must be symmetric positive semidefinite.
        # TODO: validate occurrence of OverflowError
        except (ValueError, OverflowError):  # pragma: no cover
            return 0.0

    def cdf(self, x: dict[str, float]) -> float:
        """Cumulative density function, i.e. P(X <= x)."""
        mu = self.mu
        x_ = [x[name] for name in mu]
        cdf_ = multivariate_normal([*mu.values()], self.var, allow_singular=True).cdf(x_)
        return float(cdf_)

    def sample(self) -> dict[str, float]:
        """Draw one random observation, keyed by variable name."""
        mu = self.mu
        # NOTE(review): `rvs` is called without a random state, so the `seed`
        # given to the constructor is not used here.
        draw = multivariate_normal([*mu.values()], self.var).rvs()
        # With a single variable scipy returns a scalar instead of a vector;
        # `atleast_1d` keeps the zip below valid in that case.
        return dict(zip(mu, np.atleast_1d(draw).tolist()))

    @property
    def mode(self) -> dict:
        """The mode of the distribution, which is the same value as `mu`."""
        return self.mu
Loading