9 changes: 5 additions & 4 deletions .github/workflows/ci.yml
@@ -19,17 +19,17 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
           # Default version
           - gymnasium-version: "1.0.0"
           # Add a new config to test gym<1.0
           - python-version: "3.10"
             gymnasium-version: "0.29.1"
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v6
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
@@ -39,7 +39,8 @@ jobs:
           pip install uv
           # cpu version of pytorch
           # See https://github.com/astral-sh/uv/issues/1497
-          uv pip install --system torch==2.4.1+cpu --index https://download.pytorch.org/whl/cpu
+          # Need Pytorch 2.9+ for Python 3.13
+          uv pip install --system torch==2.9.1+cpu --index https://download.pytorch.org/whl/cpu
 
           # Install master version
           # and dependencies for docs and tests
26 changes: 26 additions & 0 deletions docs/misc/changelog.rst
@@ -3,6 +3,32 @@
 Changelog
 ==========
 
+Release 2.8.0a0 (WIP)
+--------------------------
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+- Removed support for Python 3.9, please upgrade to Python >= 3.10
+- Upgraded to Stable-Baselines3 >= 2.8.0
+- Set ``strict=True`` for every call to ``zip(...)``
+
+
+New Features:
+^^^^^^^^^^^^^
+- Added official support for Python 3.13
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+
+Others:
+^^^^^^^
+
+Documentation:
+^^^^^^^^^^^^^^
+
 Release 2.7.1 (2025-12-05)
 --------------------------
 
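A quick illustration (not part of the PR) of what the ``strict=True`` breaking change above means in practice: since Python 3.10, ``zip`` accepts a ``strict`` flag that raises ``ValueError`` when its iterables have unequal lengths, instead of silently truncating to the shortest one. The values below are made up for the sketch:

    # Python >= 3.10 only: zip(..., strict=True)
    rewards = [1.0, 2.0, 3.0]
    lengths = [10, 20]  # one element short, e.g. due to an upstream bug

    # Default behaviour: the mismatch goes unnoticed and data is silently dropped.
    assert list(zip(rewards, lengths)) == [(1.0, 10), (2.0, 20)]

    # With strict=True the length mismatch surfaces immediately:
    try:
        list(zip(rewards, lengths, strict=True))
    except ValueError as exc:
        print(exc)  # zip() argument 2 is shorter than argument 1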
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
 [tool.ruff]
 # Same as Black.
 line-length = 127
-# Assume Python 3.9
-target-version = "py39"
+# Assume Python 3.10
+target-version = "py310"
 
 [tool.ruff.lint]
 select = ["E", "F", "B", "UP", "C90", "RUF"]
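Bumping ``target-version`` to ``py310`` is what allows ruff's pyupgrade (``UP``) rules, already enabled in the ``select`` list above, to rewrite ``Optional[...]``/``Union[...]`` annotations into PEP 604 unions, as seen throughout the rest of this diff. A before/after sketch with an illustrative function (not from the codebase):

    from typing import Optional, Union

    # Pre-3.10 spelling, flagged by ruff's UP rules (e.g. UP007) once target-version is py310:
    def scale_old(value: Optional[float], factor: Union[int, float]) -> Optional[float]:
        return None if value is None else value * factor

    # PEP 604 spelling, valid at runtime on Python >= 3.10:
    def scale_new(value: float | None, factor: int | float) -> float | None:
        return None if value is None else value * factor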
32 changes: 16 additions & 16 deletions sb3_contrib/ars/ars.py
@@ -3,7 +3,7 @@
 import time
 import warnings
 from functools import partial
-from typing import Any, ClassVar, Optional, TypeVar, Union
+from typing import Any, ClassVar, TypeVar
 
 import numpy as np
 import torch as th
@@ -57,21 +57,21 @@ class ARS(BaseAlgorithm):
 
     def __init__(
         self,
-        policy: Union[str, type[ARSPolicy]],
-        env: Union[GymEnv, str],
+        policy: str | type[ARSPolicy],
+        env: GymEnv | str,
         n_delta: int = 8,
-        n_top: Optional[int] = None,
-        learning_rate: Union[float, Schedule] = 0.02,
-        delta_std: Union[float, Schedule] = 0.05,
+        n_top: int | None = None,
+        learning_rate: float | Schedule = 0.02,
+        delta_std: float | Schedule = 0.05,
         zero_policy: bool = True,
         alive_bonus_offset: float = 0,
         n_eval_episodes: int = 1,
-        policy_kwargs: Optional[dict[str, Any]] = None,
+        policy_kwargs: dict[str, Any] | None = None,
         stats_window_size: int = 100,
-        tensorboard_log: Optional[str] = None,
-        seed: Optional[int] = None,
+        tensorboard_log: str | None = None,
+        seed: int | None = None,
         verbose: int = 0,
-        device: Union[th.device, str] = "cpu",
+        device: th.device | str = "cpu",
         _init_setup_model: bool = True,
     ):
         super().__init__(
@@ -137,7 +137,7 @@ def _mimic_monitor_wrapper(self, episode_rewards: np.ndarray, episode_lengths: n
         # Mimic Monitor Wrapper
         infos = [
             {"episode": {"r": episode_reward, "l": episode_length}}
-            for episode_reward, episode_length in zip(episode_rewards, episode_lengths)
+            for episode_reward, episode_length in zip(episode_rewards, episode_lengths, strict=True)
         ]
 
         self._update_info_buffer(infos)
@@ -163,7 +163,7 @@ def _trigger_callback(
         callback.on_step()
 
     def evaluate_candidates(
-        self, candidate_weights: th.Tensor, callback: BaseCallback, async_eval: Optional[AsyncEval]
+        self, candidate_weights: th.Tensor, callback: BaseCallback, async_eval: AsyncEval | None
     ) -> th.Tensor:
         """
         Evaluate each candidate.
@@ -257,7 +257,7 @@ def dump_logs(self) -> None:
         self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
         self.logger.dump(step=self.num_timesteps)
 
-    def _do_one_update(self, callback: BaseCallback, async_eval: Optional[AsyncEval]) -> None:
+    def _do_one_update(self, callback: BaseCallback, async_eval: AsyncEval | None) -> None:
         """
         Sample new candidates, evaluate them and then update current policy.
 
@@ -312,7 +312,7 @@ def learn(
         log_interval: int = 1,
         tb_log_name: str = "ARS",
         reset_num_timesteps: bool = True,
-        async_eval: Optional[AsyncEval] = None,
+        async_eval: AsyncEval | None = None,
         progress_bar: bool = False,
     ) -> SelfARS:
         """
@@ -353,9 +353,9 @@
 
     def set_parameters(
         self,
-        load_path_or_dict: Union[str, dict[str, dict]],
+        load_path_or_dict: str | dict[str, dict],
         exact_match: bool = True,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
     ) -> None:
         # Patched set_parameters() to handle ARS linear policy saved with sb3-contrib < 1.7.0
         params = None
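For context on the ``learning_rate: float | Schedule`` and ``delta_std: float | Schedule`` annotations above: a ``Schedule`` in SB3 is a callable mapping the remaining training progress (1.0 at the start, 0.0 at the end) to a value. A hedged usage sketch; the env choice and timestep budget are illustrative, only the 0.02 default comes from the signature above:

    from sb3_contrib import ARS


    def linear_schedule(initial_value: float):
        """Build an SB3-style schedule: progress_remaining decreases from 1.0 to 0.0."""

        def schedule(progress_remaining: float) -> float:
            return progress_remaining * initial_value

        return schedule


    # Both a constant float and a Schedule callable are accepted.
    model = ARS("LinearPolicy", "Pendulum-v1", learning_rate=linear_schedule(0.02), delta_std=0.05, verbose=1)
    model.learn(total_timesteps=10_000)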
4 changes: 2 additions & 2 deletions sb3_contrib/ars/policies.py
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any
 
 import torch as th
 from gymnasium import spaces
@@ -26,7 +26,7 @@ def __init__(
         self,
         observation_space: spaces.Space,
         action_space: spaces.Space,
-        net_arch: Optional[list[int]] = None,
+        net_arch: list[int] | None = None,
         activation_fn: type[nn.Module] = nn.ReLU,
         with_bias: bool = True,
         squash_output: bool = True,
8 changes: 3 additions & 5 deletions sb3_contrib/common/envs/invalid_actions_env.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import numpy as np
 from gymnasium import spaces
 from stable_baselines3.common.envs import IdentityEnv
@@ -12,7 +10,7 @@ class InvalidActionEnvDiscrete(IdentityEnv[int]):
 
     def __init__(
         self,
-        dim: Optional[int] = None,
+        dim: int | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):
@@ -47,7 +45,7 @@ class InvalidActionEnvMultiDiscrete(IdentityEnv[np.ndarray]):
 
     def __init__(
         self,
-        dims: Optional[list[int]] = None,
+        dims: list[int] | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):
@@ -89,7 +87,7 @@ class InvalidActionEnvMultiBinary(IdentityEnv[np.ndarray]):
 
     def __init__(
         self,
-        dims: Optional[int] = None,
+        dims: int | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
    ):
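These identity envs exist mainly to exercise action masking, and their usual pairing is with ``MaskablePPO``, which picks up the env's ``action_masks()`` method automatically. A usage sketch adapted from the project's README (hyperparameter values illustrative):

    from sb3_contrib import MaskablePPO
    from sb3_contrib.common.envs import InvalidActionEnvDiscrete

    # Identity env where some discrete actions are marked invalid at each step.
    env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)

    # MaskablePPO discovers the env's action_masks() method on its own.
    model = MaskablePPO("MlpPolicy", env, gamma=0.4, seed=32, verbose=1)
    model.learn(5_000)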
18 changes: 9 additions & 9 deletions sb3_contrib/common/maskable/buffers.py
@@ -1,5 +1,5 @@
 from collections.abc import Generator
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple
 
 import numpy as np
 import torch as th
@@ -50,7 +50,7 @@ def __init__(
         buffer_size: int,
         observation_space: spaces.Space,
         action_space: spaces.Space,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
         gae_lambda: float = 1,
         gamma: float = 0.99,
         n_envs: int = 1,
@@ -76,7 +76,7 @@ def reset(self) -> None:
 
         super().reset()
 
-    def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> None:
+    def add(self, *args, action_masks: np.ndarray | None = None, **kwargs) -> None:
         """
         :param action_masks: Masks applied to constrain the choice of possible actions.
         """
@@ -85,7 +85,7 @@ def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> Non
 
         super().add(*args, **kwargs)
 
-    def get(self, batch_size: Optional[int] = None) -> Generator[MaskableRolloutBufferSamples, None, None]:  # type: ignore[override]
+    def get(self, batch_size: int | None = None) -> Generator[MaskableRolloutBufferSamples, None, None]:  # type: ignore[override]
         assert self.full, ""
         indices = np.random.permutation(self.buffer_size * self.n_envs)
         # Prepare the data
@@ -111,7 +111,7 @@ def get(self, batch_size: Optional[int] = None) -> Generator[MaskableRolloutBuff
             yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
 
-    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> MaskableRolloutBufferSamples:  # type: ignore[override]
+    def _get_samples(self, batch_inds: np.ndarray, env: VecNormalize | None = None) -> MaskableRolloutBufferSamples:  # type: ignore[override]
         data = (
             self.observations[batch_inds],
             self.actions[batch_inds],
@@ -156,7 +156,7 @@ def __init__(
         buffer_size: int,
         observation_space: spaces.Dict,
         action_space: spaces.Space,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
         gae_lambda: float = 1,
         gamma: float = 0.99,
         n_envs: int = 1,
@@ -182,7 +182,7 @@ def reset(self) -> None:
 
         super().reset()
 
-    def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> None:
+    def add(self, *args, action_masks: np.ndarray | None = None, **kwargs) -> None:
         """
         :param action_masks: Masks applied to constrain the choice of possible actions.
         """
@@ -191,7 +191,7 @@ def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> Non
 
         super().add(*args, **kwargs)
 
-    def get(self, batch_size: Optional[int] = None) -> Generator[MaskableDictRolloutBufferSamples, None, None]:  # type: ignore[override]
+    def get(self, batch_size: int | None = None) -> Generator[MaskableDictRolloutBufferSamples, None, None]:  # type: ignore[override]
         assert self.full, ""
         indices = np.random.permutation(self.buffer_size * self.n_envs)
         # Prepare the data
@@ -214,7 +214,7 @@ def get(self, batch_size: Optional[int] = None) -> Generator[MaskableDictRollout
             yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
 
-    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> MaskableDictRolloutBufferSamples:  # type: ignore[override]
+    def _get_samples(self, batch_inds: np.ndarray, env: VecNormalize | None = None) -> MaskableDictRolloutBufferSamples:  # type: ignore[override]
         return MaskableDictRolloutBufferSamples(
             observations={key: self.to_torch(obs[batch_inds]) for (key, obs) in self.observations.items()},
             actions=self.to_torch(self.actions[batch_inds]),
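For readers unfamiliar with the ``get()`` pattern visible above: the buffer permutes all ``buffer_size * n_envs`` transition indices once, then yields slices of ``batch_size`` until exhausted. A simplified standalone sketch of just that iteration logic (names illustrative, not the buffer's real internals):

    from collections.abc import Generator

    import numpy as np


    def iterate_minibatches(n_samples: int, batch_size: int | None = None) -> Generator[np.ndarray, None, None]:
        """Yield random index batches covering every sample exactly once."""
        indices = np.random.permutation(n_samples)
        if batch_size is None:
            batch_size = n_samples  # single big batch, mirroring the buffer's default
        start_idx = 0
        while start_idx < n_samples:
            yield indices[start_idx : start_idx + batch_size]
            start_idx += batch_size


    for batch_inds in iterate_minibatches(n_samples=8, batch_size=3):
        print(batch_inds)  # e.g. [5 0 7], then [2 6 1], then [4 3]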
18 changes: 9 additions & 9 deletions sb3_contrib/common/maskable/distributions.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Optional, TypeVar, Union
+from typing import TypeVar, Union
 
 import numpy as np
 import torch as th
@@ -13,7 +13,7 @@
 SelfMaskableMultiCategoricalDistribution = TypeVar(
     "SelfMaskableMultiCategoricalDistribution", bound="MaskableMultiCategoricalDistribution"
 )
-MaybeMasks = Union[th.Tensor, np.ndarray, None]
+MaybeMasks = Union[th.Tensor, np.ndarray, None]  # noqa: UP007
 
 
 class MaskableCategorical(Categorical):
@@ -34,12 +34,12 @@ class MaskableCategorical(Categorical):
 
     def __init__(
         self,
-        probs: Optional[th.Tensor] = None,
-        logits: Optional[th.Tensor] = None,
-        validate_args: Optional[bool] = None,
+        probs: th.Tensor | None = None,
+        logits: th.Tensor | None = None,
+        validate_args: bool | None = None,
         masks: MaybeMasks = None,
     ):
-        self.masks: Optional[th.Tensor] = None
+        self.masks: th.Tensor | None = None
         super().__init__(probs, logits, validate_args)
         self._original_logits = self.logits
         self.apply_masking(masks)
@@ -112,7 +112,7 @@ class MaskableCategoricalDistribution(MaskableDistribution):
 
     def __init__(self, action_dim: int):
         super().__init__()
-        self.distribution: Optional[MaskableCategorical] = None
+        self.distribution: MaskableCategorical | None = None
         self.action_dim = action_dim
 
     def proba_distribution_net(self, latent_dim: int) -> nn.Module:
@@ -212,7 +212,7 @@ def log_prob(self, actions: th.Tensor) -> th.Tensor:
 
         # Extract each discrete action and compute log prob for their respective distributions
         return th.stack(
-            [dist.log_prob(action) for dist, action in zip(self.distributions, th.unbind(actions, dim=1))], dim=1
+            [dist.log_prob(action) for dist, action in zip(self.distributions, th.unbind(actions, dim=1), strict=True)], dim=1
         ).sum(dim=1)
 
     def entropy(self) -> th.Tensor:
@@ -248,7 +248,7 @@ def apply_masking(self, masks: MaybeMasks) -> None:
         # Then split columnwise for each discrete action
         split_masks = th.split(masks_tensor, list(self.action_dims), dim=1)  # type: ignore[assignment]
 
-        for distribution, mask in zip(self.distributions, split_masks):
+        for distribution, mask in zip(self.distributions, split_masks, strict=True):
             distribution.apply_masking(mask)
 
 
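The masking technique behind ``MaskableCategorical.apply_masking`` is the standard one: logits of invalid actions are replaced with the most negative representable value, so their probability becomes numerically zero. A minimal sketch of the idea with a plain torch ``Categorical`` (not the library's exact implementation; the logits and mask are made up):

    import torch as th
    from torch.distributions import Categorical

    logits = th.tensor([1.0, 2.0, 0.5, -1.0])
    mask = th.tensor([True, False, True, False])  # False = invalid action

    # Replace invalid logits with the dtype's most negative value before building the distribution.
    masked_logits = th.where(mask, logits, th.tensor(th.finfo(logits.dtype).min))
    dist = Categorical(logits=masked_logits)

    print(dist.probs)  # invalid actions end up with ~0 probability
    assert dist.sample().item() in (0, 2)  # only valid actions can be drawn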
11 changes: 6 additions & 5 deletions sb3_contrib/common/maskable/evaluation.py
@@ -1,5 +1,6 @@
 import warnings
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import gymnasium as gym
 import numpy as np
@@ -12,16 +13,16 @@
 
 def evaluate_policy(
     model: MaskablePPO,
-    env: Union[gym.Env, VecEnv],
+    env: gym.Env | VecEnv,
     n_eval_episodes: int = 10,
     deterministic: bool = True,
     render: bool = False,
-    callback: Optional[Callable[[dict[str, Any], dict[str, Any]], None]] = None,
-    reward_threshold: Optional[float] = None,
+    callback: Callable[[dict[str, Any], dict[str, Any]], None] | None = None,
+    reward_threshold: float | None = None,
     return_episode_rewards: bool = False,
     warn: bool = True,
     use_masking: bool = True,
-) -> Union[tuple[float, float], tuple[list[float], list[int]]]:
+) -> tuple[float, float] | tuple[list[float], list[int]]:
     """
     Runs policy for ``n_eval_episodes`` episodes and returns average reward.
     If a vector env is passed in, this divides the episodes to evaluate onto the
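A usage sketch for the updated signature, continuing the ``MaskablePPO`` example from earlier (episode counts illustrative); the two unpacking forms follow the ``tuple[float, float] | tuple[list[float], list[int]]`` return annotation above:

    from sb3_contrib import MaskablePPO
    from sb3_contrib.common.envs import InvalidActionEnvDiscrete
    from sb3_contrib.common.maskable.evaluation import evaluate_policy

    env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)
    model = MaskablePPO("MlpPolicy", env, seed=32).learn(5_000)

    # Default: (mean_reward, std_reward); action masks are applied during evaluation.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, use_masking=True)

    # With return_episode_rewards=True: per-episode rewards and lengths instead.
    episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=20, return_episode_rewards=True)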