Merge branch 'online-ml:main' into master

online-ml · Aug 28, 2024 · b71b822 · b71b822
2 parents ebe9568 + 7de4751
commit b71b822
Show file tree

Hide file tree

Showing 142 changed files with 815 additions and 749 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,26 +1,28 @@
 files: river
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.2.0
+    rev: v4.4.0
     hooks:
       - id: check-json
       - id: check-yaml
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-      - id: mixed-line-ending
 
-  - repo: local
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.5.7
     hooks:
+      # Run the linter.
       - id: ruff
-        name: ruff
-        language: python
-        types: [python]
-        entry: ruff
-        args: 
-          - --fix
+        types_or: [python, pyi, jupyter]
+        args: [--fix]
+      # Run the formatter.
+      - id: ruff-format
+        types_or: [python, pyi, jupyter]
 
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.1.1"
+    hooks:
       - id: mypy
-        name: mypy
-        language: python
-        types: [python]
-        entry: mypy --implicit-optional
+        args:
+          - "--config-file=pyproject.toml"
+          - "--python-version=3.11"
+          - "--implicit-optional"
diff --git a/docs/recipes/on-hoeffding-trees.ipynb b/docs/recipes/on-hoeffding-trees.ipynb
diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
@@ -1 +1,18 @@
 # Unreleased
+
+- The units used in River have been corrected to be based on powers of 2 (KiB, MiB). This only changes the display; the behaviour is unchanged.
+
+## cluster
+
+- Update the description of `cluster.ODAC`.
+- Change `draw` in `cluster.ODAC` to draw the hierarchical cluster's structure as a Graphviz graph.
+- Add `render_ascii` in `cluster.ODAC` to render the hierarchical cluster's structure in text format.
+- Work with `stats.Var` in `cluster.ODAC` when a cluster has only one time series.
+
+## stats
+
+- Removed the unexported class `stats.CentralMoments`.
+
+## tree
+
+- Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` will stop the trees from growing when they reach the system recursion limit.
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -85,7 +85,6 @@ plugins:
 
 extra_javascript:
   - javascripts/config.js
-  - https://polyfill.io/v3/polyfill.min.js?features=es6
   - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
   - https://cdn.jsdelivr.net/npm/vega@5
   - https://cdn.jsdelivr.net/npm/vega-lite@5

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -35,12 +35,12 @@ pandas = "^2.1"
 
 [tool.poetry.group.dev.dependencies]
 graphviz = "^0.20.1"
-gym = "^0.26.2"
+gymnasium = "^0.29.0"
 matplotlib = "^3.0.2"
-mypy = "^1.6.1"
+mypy = "^1.11.1"
 pre-commit = "^3.5.0"
 pytest = "^7.4.2"
-ruff = "^0.1.1"
+ruff = "^0.4.10"
 scikit-learn = "^1.3.1"
 sqlalchemy = "^2.0.22"
 sympy = "^1.10.1"
@@ -117,20 +117,36 @@ markers = [
 ]
 
 [tool.ruff]
-select = ["E", "F", "I", "UP"]  # https://beta.ruff.rs/docs/rules/
 line-length = 100
 target-version = 'py310'
+extend-include = ["*.ipynb"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # isort
+    "I",
+]
 ignore = ["E501"]
+fixable = ["ALL"]
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 required-imports = ["from __future__ import annotations"]
 
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+
 [tool.mypy]
 files = "river"
 
 [[tool.mypy.overrides]]
 module = [
-    "river.*",
     "mmh3.*",
     "numpy.*",
     "sklearn.*",
@@ -141,6 +157,9 @@ module = [
     "vaex.*",
     "torch.*",
     "sqlalchemy.*",
-    "requests.*"
+    "requests.*",
+    "gymnasium.*",
+    "sympy.*",
+    "polars.*"
 ]
 ignore_missing_imports = true
diff --git a/river/__init__.py b/river/__init__.py
@@ -5,6 +5,7 @@
 memory, or simply when it isn't available all at once. river's API is heavily inspired from that of
 scikit-learn, enough so that users who are familiar with scikit-learn should feel right at home.
 """
+
 from __future__ import annotations
 
 from .__version__ import __version__  # noqa: F401
diff --git a/river/active/__init__.py b/river/active/__init__.py
@@ -1,4 +1,5 @@
 """Online active learning."""
+
 from __future__ import annotations
 
 from . import base

diff --git a/river/active/base.py b/river/active/base.py
@@ -30,8 +30,7 @@ def _wrapped_model(self):
         return self.classifier
 
     @abc.abstractmethod
-    def _ask_for_label(self, x, y_pred) -> bool:
-        ...
+    def _ask_for_label(self, x, y_pred) -> bool: ...
 
     def predict_proba_one(self, x, **kwargs):
         """Predict the probability of each label for `x` and indicate whether a label is needed.

diff --git a/river/active/entropy.py b/river/active/entropy.py
@@ -63,9 +63,7 @@ class EntropySampler(ActiveLearningClassifier):
 
     """
 
-    def __init__(
-        self, classifier: base.Classifier, discount_factor: float = 3, seed=None
-    ):
+    def __init__(self, classifier: base.Classifier, discount_factor: float = 3, seed=None):
         super().__init__(classifier, seed=seed)
         self.discount_factor = discount_factor
 

diff --git a/river/anomaly/filter.py b/river/anomaly/filter.py
@@ -86,9 +86,7 @@ class ThresholdFilter(anomaly.base.AnomalyFilter):
 
     """
 
-    def __init__(
-        self, anomaly_detector, threshold: float, protect_anomaly_detector=True
-    ):
+    def __init__(self, anomaly_detector, threshold: float, protect_anomaly_detector=True):
         super().__init__(
             anomaly_detector=anomaly_detector,
             protect_anomaly_detector=protect_anomaly_detector,
@@ -188,7 +186,6 @@ def _unit_test_params(cls):
         from river import preprocessing
 
         yield {
-            "anomaly_detector": preprocessing.StandardScaler()
-            | anomaly.OneClassSVM(nu=0.2),
+            "anomaly_detector": preprocessing.StandardScaler() | anomaly.OneClassSVM(nu=0.2),
             "q": 0.995,
         }
diff --git a/river/anomaly/pad.py b/river/anomaly/pad.py
@@ -100,7 +100,6 @@ def __init__(
         n_std: float = 3.0,
         warmup_period: int = 0,
     ):
-
         self.predictive_model = (
             predictive_model
             if predictive_model is not None
@@ -123,9 +122,7 @@ def learn_one(self, x: dict | None, y: base.typing.Target | float):
         self.iter += 1
 
         # Check whether the model is a time-series forecasting or regression/classification model
-        if isinstance(
-            self.predictive_model, time_series.base.Forecaster
-        ) and isinstance(y, float):
+        if isinstance(self.predictive_model, time_series.base.Forecaster) and isinstance(y, float):
             # When there's no data point as dict of features, the target will be passed
             # to the forecaster as an exogenous variable.
             if not x:

diff --git a/river/api.py b/river/api.py
@@ -1,4 +1,5 @@
 """River API module."""
+
 from __future__ import annotations
 
 from . import (

diff --git a/river/bandit/__init__.py b/river/bandit/__init__.py
@@ -5,6 +5,7 @@
 (see `model_selection.BanditRegressor`).
 
 """
+
 from __future__ import annotations
 
 from . import base, datasets, envs

diff --git a/river/bandit/base.py b/river/bandit/base.py
@@ -65,8 +65,7 @@ def __post_init__(self):
             )
 
     @abc.abstractmethod
-    def _pull(self, arm_ids: list[ArmID]) -> ArmID:
-        ...
+    def _pull(self, arm_ids: list[ArmID]) -> ArmID: ...
 
     def pull(self, arm_ids: list[ArmID]) -> ArmID:
         """Pull arm(s).

diff --git a/river/bandit/bayes_ucb.py b/river/bandit/bayes_ucb.py
@@ -28,7 +28,7 @@ class BayesUCB(bandit.base.Policy):
     Examples
     --------
 
-    >>> import gym
+    >>> import gymnasium as gym
     >>> from river import bandit
     >>> from river import proba
     >>> from river import stats
@@ -63,9 +63,9 @@ class BayesUCB(bandit.base.Policy):
 
     def __init__(self, reward_obj=None, burn_in=0, seed: int | None = None):
         super().__init__(reward_obj, burn_in)
-        self._posteriors: collections.defaultdict[
-            bandit.base.ArmID, proba.Beta
-        ] = collections.defaultdict(proba.Beta)
+        self._posteriors: collections.defaultdict[bandit.base.ArmID, proba.Beta] = (
+            collections.defaultdict(proba.Beta)
+        )
         self.seed = seed
         self._rng = random.Random(seed)
 

diff --git a/river/bandit/envs/__init__.py b/river/bandit/envs/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 try:
-    import gym
+    import gymnasium as gym
 
     GYM_INSTALLED = True
 except ImportError:

diff --git a/river/bandit/envs/candy_cane.py b/river/bandit/envs/candy_cane.py
@@ -2,7 +2,7 @@
 
 import dataclasses
 
-import gym
+import gymnasium as gym
 
 
 @dataclasses.dataclass
@@ -25,7 +25,7 @@ class CandyCaneContest(gym.Env):
     Examples
     --------
 
-    >>> import gym
+    >>> import gymnasium as gym
     >>> from river import stats
 
     >>> env = gym.make('river_bandits/CandyCaneContest-v0')
@@ -58,12 +58,8 @@ def __init__(self, n_machines=100, reward_decay=0.03):
         self.action_space = gym.spaces.Discrete(n_machines)
         self.observation_space = gym.spaces.Dict(
             {
-                "attempts": gym.spaces.Tuple(
-                    [gym.spaces.Discrete(self.n_steps)] * n_machines
-                ),
-                "successes": gym.spaces.Tuple(
-                    [gym.spaces.Discrete(self.n_steps)] * n_machines
-                ),
+                "attempts": gym.spaces.Tuple([gym.spaces.Discrete(self.n_steps)] * n_machines),
+                "successes": gym.spaces.Tuple([gym.spaces.Discrete(self.n_steps)] * n_machines),
             }
         )
         self.reward_range = (0.0, 1.0)

diff --git a/river/bandit/envs/testbed.py b/river/bandit/envs/testbed.py
@@ -2,7 +2,7 @@
 
 import math
 
-import gym
+import gymnasium as gym
 
 
 class KArmedTestbed(gym.Env):

diff --git a/river/bandit/epsilon_greedy.py b/river/bandit/epsilon_greedy.py
@@ -33,7 +33,7 @@ class EpsilonGreedy(bandit.base.Policy):
     Examples
     --------
 
-    >>> import gym
+    >>> import gymnasium as gym
     >>> from river import bandit
     >>> from river import stats
 

diff --git a/river/bandit/evaluate.py b/river/bandit/evaluate.py
@@ -5,7 +5,7 @@
 import typing
 
 try:
-    import gym
+    import gymnasium as gym
 except ImportError:
     ...
 
@@ -52,7 +52,7 @@ def evaluate(
     Examples
     --------
 
-    >>> import gym
+    >>> import gymnasium as gym
     >>> from river import bandit
 
     >>> trace = bandit.evaluate(

diff --git a/river/bandit/exp3.py b/river/bandit/exp3.py
@@ -35,7 +35,7 @@ class Exp3(bandit.base.Policy):
     Examples
     --------
 
-    >>> import gym
+    >>> import gymnasium as gym
     >>> from river import bandit
     >>> from river import proba
     >>> from river import stats
@@ -77,9 +77,7 @@ def __init__(
         burn_in=0,
         seed: int | None = None,
     ):
-        super().__init__(
-            reward_obj=reward_obj, reward_scaler=reward_scaler, burn_in=burn_in
-        )
+        super().__init__(reward_obj=reward_obj, reward_scaler=reward_scaler, burn_in=burn_in)
         self.seed = seed
         self.gamma = gamma
         self._rng = random.Random(seed)
@@ -91,8 +89,7 @@ def __init__(
     def _pull(self, arm_ids):
         total = sum(self._weights[arm_id] for arm_id in arm_ids)
         self._probabilities = {
-            arm_id: (1 - self.gamma) * (self._weights[arm_id] / total)
-            + self.gamma / len(arm_ids)
+            arm_id: (1 - self.gamma) * (self._weights[arm_id] / total) + self.gamma / len(arm_ids)
             for arm_id in arm_ids
         }
         return self._rng.choices(arm_ids, weights=self._probabilities.values())[0]

diff --git a/river/bandit/random.py b/river/bandit/random.py
@@ -23,7 +23,7 @@ class RandomPolicy(bandit.base.Policy):
     Examples
     --------
 
-    >>> import gym
+    >>> import gymnasium as gym
     >>> from river import bandit
     >>> from river import proba
     >>> from river import stats
@@ -52,9 +52,9 @@ class RandomPolicy(bandit.base.Policy):
 
     def __init__(self, reward_obj=None, burn_in=0, seed: int | None = None):
         super().__init__(reward_obj, burn_in)
-        self._posteriors: collections.defaultdict[
-            bandit.base.ArmID, proba.Beta
-        ] = collections.defaultdict(proba.Beta)
+        self._posteriors: collections.defaultdict[bandit.base.ArmID, proba.Beta] = (
+            collections.defaultdict(proba.Beta)
+        )
         self.seed = seed
         self._rng = random.Random(seed)
 

diff --git a/river/bandit/test_envs.py b/river/bandit/test_envs.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-import gym.utils.env_checker
+import gymnasium as gym
+import gymnasium.utils.env_checker
 import pytest
 
 from river import bandit
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,7 @@ @@
     (see `model_selection.BanditRegressor`).
     """
     from __future__ import annotations
     from . import base, datasets, envs
@@ Expand Down @@