9 changes: 5 additions & 4 deletions .github/workflows/ci.yml
@@ -19,17 +19,17 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
           # Default version
           - gymnasium-version: "1.0.0"
           # Add a new config to test gym<1.0
           - python-version: "3.10"
             gymnasium-version: "0.29.1"
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v6
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
@@ -39,7 +39,8 @@ jobs:
           pip install uv
           # cpu version of pytorch
           # See https://github.com/astral-sh/uv/issues/1497
-          uv pip install --system torch==2.4.1+cpu --index https://download.pytorch.org/whl/cpu
+          # Need Pytorch 2.9+ for Python 3.13
+          uv pip install --system torch==2.9.1+cpu --index https://download.pytorch.org/whl/cpu
 
           # Install master version
           # and dependencies for docs and tests
26 changes: 26 additions & 0 deletions docs/misc/changelog.rst
@@ -3,6 +3,32 @@
 Changelog
 ==========
 
+Release 2.8.0a0 (WIP)
+--------------------------
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+- Removed support for Python 3.9, please upgrade to Python >= 3.10
+- Upgraded to Stable-Baselines3 >= 2.8.0
+- Set ``strict=True`` for every call to ``zip(...)``
+
+
+New Features:
+^^^^^^^^^^^^^
+- Added official support for Python 3.13
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+
+Others:
+^^^^^^^
+
+Documentation:
+^^^^^^^^^^^^^^
+
 Release 2.7.1 (2025-12-05)
 --------------------------
 
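A quick illustration (not part of the PR) of what the ``strict=True`` breaking change above means in practice: since Python 3.10, ``zip`` accepts a ``strict`` flag that raises ``ValueError`` when its iterables have unequal lengths, instead of silently truncating to the shortest one. The values below are made up for the sketch:

    # Python >= 3.10 only: zip(..., strict=True)
    rewards = [1.0, 2.0, 3.0]
    lengths = [10, 20]  # one element short, e.g. due to an upstream bug

    # Default behaviour: the mismatch goes unnoticed and data is silently dropped.
    assert list(zip(rewards, lengths)) == [(1.0, 10), (2.0, 20)]

    # With strict=True the length mismatch surfaces immediately:
    try:
        list(zip(rewards, lengths, strict=True))
    except ValueError as exc:
        print(exc)  # zip() argument 2 is shorter than argument 1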
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
 [tool.ruff]
 # Same as Black.
 line-length = 127
-# Assume Python 3.9
-target-version = "py39"
+# Assume Python 3.10
+target-version = "py310"
 
 [tool.ruff.lint]
 select = ["E", "F", "B", "UP", "C90", "RUF"]
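Bumping ``target-version`` to ``py310`` is what allows ruff's pyupgrade (``UP``) rules, already enabled in the ``select`` list above, to rewrite ``Optional[...]``/``Union[...]`` annotations into PEP 604 unions, as seen throughout the rest of this diff. A before/after sketch with an illustrative function (not from the codebase):

    from typing import Optional, Union

    # Pre-3.10 spelling, flagged by ruff's UP rules (e.g. UP007) once target-version is py310:
    def scale_old(value: Optional[float], factor: Union[int, float]) -> Optional[float]:
        return None if value is None else value * factor

    # PEP 604 spelling, valid at runtime on Python >= 3.10:
    def scale_new(value: float | None, factor: int | float) -> float | None:
        return None if value is None else value * factor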
32 changes: 16 additions & 16 deletions sb3_contrib/ars/ars.py
@@ -3,7 +3,7 @@
 import time
 import warnings
 from functools import partial
-from typing import Any, ClassVar, Optional, TypeVar, Union
+from typing import Any, ClassVar, TypeVar
 
 import numpy as np
 import torch as th
@@ -57,21 +57,21 @@ class ARS(BaseAlgorithm):
 
     def __init__(
         self,
-        policy: Union[str, type[ARSPolicy]],
-        env: Union[GymEnv, str],
+        policy: str | type[ARSPolicy],
+        env: GymEnv | str,
         n_delta: int = 8,
-        n_top: Optional[int] = None,
-        learning_rate: Union[float, Schedule] = 0.02,
-        delta_std: Union[float, Schedule] = 0.05,
+        n_top: int | None = None,
+        learning_rate: float | Schedule = 0.02,
+        delta_std: float | Schedule = 0.05,
         zero_policy: bool = True,
         alive_bonus_offset: float = 0,
         n_eval_episodes: int = 1,
-        policy_kwargs: Optional[dict[str, Any]] = None,
+        policy_kwargs: dict[str, Any] | None = None,
         stats_window_size: int = 100,
-        tensorboard_log: Optional[str] = None,
-        seed: Optional[int] = None,
+        tensorboard_log: str | None = None,
+        seed: int | None = None,
         verbose: int = 0,
-        device: Union[th.device, str] = "cpu",
+        device: th.device | str = "cpu",
         _init_setup_model: bool = True,
     ):
         super().__init__(
@@ -137,7 +137,7 @@ def _mimic_monitor_wrapper(self, episode_rewards: np.ndarray, episode_lengths: n
         # Mimic Monitor Wrapper
         infos = [
             {"episode": {"r": episode_reward, "l": episode_length}}
-            for episode_reward, episode_length in zip(episode_rewards, episode_lengths)
+            for episode_reward, episode_length in zip(episode_rewards, episode_lengths, strict=True)
         ]
 
         self._update_info_buffer(infos)
@@ -163,7 +163,7 @@ def _trigger_callback(
         callback.on_step()
 
     def evaluate_candidates(
-        self, candidate_weights: th.Tensor, callback: BaseCallback, async_eval: Optional[AsyncEval]
+        self, candidate_weights: th.Tensor, callback: BaseCallback, async_eval: AsyncEval | None
     ) -> th.Tensor:
         """
         Evaluate each candidate.
@@ -257,7 +257,7 @@ def dump_logs(self) -> None:
         self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
         self.logger.dump(step=self.num_timesteps)
 
-    def _do_one_update(self, callback: BaseCallback, async_eval: Optional[AsyncEval]) -> None:
+    def _do_one_update(self, callback: BaseCallback, async_eval: AsyncEval | None) -> None:
         """
         Sample new candidates, evaluate them and then update current policy.
 
@@ -312,7 +312,7 @@ def learn(
         log_interval: int = 1,
         tb_log_name: str = "ARS",
         reset_num_timesteps: bool = True,
-        async_eval: Optional[AsyncEval] = None,
+        async_eval: AsyncEval | None = None,
         progress_bar: bool = False,
     ) -> SelfARS:
         """
@@ -353,9 +353,9 @@
 
     def set_parameters(
         self,
-        load_path_or_dict: Union[str, dict[str, dict]],
+        load_path_or_dict: str | dict[str, dict],
         exact_match: bool = True,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
     ) -> None:
         # Patched set_parameters() to handle ARS linear policy saved with sb3-contrib < 1.7.0
         params = None
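For context on the ``learning_rate: float | Schedule`` and ``delta_std: float | Schedule`` annotations above: a ``Schedule`` in SB3 is a callable mapping the remaining training progress (1.0 at the start, 0.0 at the end) to a value. A hedged usage sketch; the env choice and timestep budget are illustrative, only the 0.02 default comes from the signature above:

    from sb3_contrib import ARS


    def linear_schedule(initial_value: float):
        """Build an SB3-style schedule: progress_remaining decreases from 1.0 to 0.0."""

        def schedule(progress_remaining: float) -> float:
            return progress_remaining * initial_value

        return schedule


    # Both a constant float and a Schedule callable are accepted.
    model = ARS("LinearPolicy", "Pendulum-v1", learning_rate=linear_schedule(0.02), delta_std=0.05, verbose=1)
    model.learn(total_timesteps=10_000)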
4 changes: 2 additions & 2 deletions sb3_contrib/ars/policies.py
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any
 
 import torch as th
 from gymnasium import spaces
@@ -26,7 +26,7 @@ def __init__(
         self,
         observation_space: spaces.Space,
         action_space: spaces.Space,
-        net_arch: Optional[list[int]] = None,
+        net_arch: list[int] | None = None,
         activation_fn: type[nn.Module] = nn.ReLU,
         with_bias: bool = True,
         squash_output: bool = True,
8 changes: 3 additions & 5 deletions sb3_contrib/common/envs/invalid_actions_env.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import numpy as np
 from gymnasium import spaces
 from stable_baselines3.common.envs import IdentityEnv
@@ -12,7 +10,7 @@ class InvalidActionEnvDiscrete(IdentityEnv[int]):
 
     def __init__(
         self,
-        dim: Optional[int] = None,
+        dim: int | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):
@@ -47,7 +45,7 @@ class InvalidActionEnvMultiDiscrete(IdentityEnv[np.ndarray]):
 
     def __init__(
         self,
-        dims: Optional[list[int]] = None,
+        dims: list[int] | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):
@@ -89,7 +87,7 @@ class InvalidActionEnvMultiBinary(IdentityEnv[np.ndarray]):
 
     def __init__(
         self,
-        dims: Optional[int] = None,
+        dims: int | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
    ):
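These identity envs exist mainly to exercise action masking, and their usual pairing is with ``MaskablePPO``, which picks up the env's ``action_masks()`` method automatically. A usage sketch adapted from the project's README (hyperparameter values illustrative):

    from sb3_contrib import MaskablePPO
    from sb3_contrib.common.envs import InvalidActionEnvDiscrete

    # Identity env where some discrete actions are marked invalid at each step.
    env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)

    # MaskablePPO discovers the env's action_masks() method on its own.
    model = MaskablePPO("MlpPolicy", env, gamma=0.4, seed=32, verbose=1)
    model.learn(5_000)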
18 changes: 9 additions & 9 deletions sb3_contrib/common/maskable/buffers.py
@@ -1,5 +1,5 @@
 from collections.abc import Generator
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple
 
 import numpy as np
 import torch as th
@@ -50,7 +50,7 @@ def __init__(
         buffer_size: int,
         observation_space: spaces.Space,
         action_space: spaces.Space,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
         gae_lambda: float = 1,
         gamma: float = 0.99,
         n_envs: int = 1,
@@ -76,7 +76,7 @@ def reset(self) -> None:
 
         super().reset()
 
-    def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> None:
+    def add(self, *args, action_masks: np.ndarray | None = None, **kwargs) -> None:
         """
         :param action_masks: Masks applied to constrain the choice of possible actions.
         """
@@ -85,7 +85,7 @@ def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> Non
 
         super().add(*args, **kwargs)
 
-    def get(self, batch_size: Optional[int] = None) -> Generator[MaskableRolloutBufferSamples, None, None]:  # type: ignore[override]
+    def get(self, batch_size: int | None = None) -> Generator[MaskableRolloutBufferSamples, None, None]:  # type: ignore[override]
         assert self.full, ""
         indices = np.random.permutation(self.buffer_size * self.n_envs)
         # Prepare the data
@@ -111,7 +111,7 @@ def get(self, batch_size: Optional[int] = None) -> Generator[MaskableRolloutBuff
             yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
 
-    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> MaskableRolloutBufferSamples:  # type: ignore[override]
+    def _get_samples(self, batch_inds: np.ndarray, env: VecNormalize | None = None) -> MaskableRolloutBufferSamples:  # type: ignore[override]
         data = (
             self.observations[batch_inds],
             self.actions[batch_inds],
@@ -156,7 +156,7 @@ def __init__(
         buffer_size: int,
         observation_space: spaces.Dict,
         action_space: spaces.Space,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
         gae_lambda: float = 1,
         gamma: float = 0.99,
         n_envs: int = 1,
@@ -182,7 +182,7 @@ def reset(self) -> None:
 
         super().reset()
 
-    def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> None:
+    def add(self, *args, action_masks: np.ndarray | None = None, **kwargs) -> None:
         """
         :param action_masks: Masks applied to constrain the choice of possible actions.
         """
@@ -191,7 +191,7 @@ def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> Non
 
         super().add(*args, **kwargs)
 
-    def get(self, batch_size: Optional[int] = None) -> Generator[MaskableDictRolloutBufferSamples, None, None]:  # type: ignore[override]
+    def get(self, batch_size: int | None = None) -> Generator[MaskableDictRolloutBufferSamples, None, None]:  # type: ignore[override]
         assert self.full, ""
         indices = np.random.permutation(self.buffer_size * self.n_envs)
         # Prepare the data
@@ -214,7 +214,7 @@ def get(self, batch_size: Optional[int] = None) -> Generator[MaskableDictRollout
             yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
 
-    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> MaskableDictRolloutBufferSamples:  # type: ignore[override]
+    def _get_samples(self, batch_inds: np.ndarray, env: VecNormalize | None = None) -> MaskableDictRolloutBufferSamples:  # type: ignore[override]
         return MaskableDictRolloutBufferSamples(
             observations={key: self.to_torch(obs[batch_inds]) for (key, obs) in self.observations.items()},
             actions=self.to_torch(self.actions[batch_inds]),
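For readers unfamiliar with the ``get()`` pattern visible above: the buffer permutes all ``buffer_size * n_envs`` transition indices once, then yields slices of ``batch_size`` until exhausted. A simplified standalone sketch of just that iteration logic (names illustrative, not the buffer's real internals):

    from collections.abc import Generator

    import numpy as np


    def iterate_minibatches(n_samples: int, batch_size: int | None = None) -> Generator[np.ndarray, None, None]:
        """Yield random index batches covering every sample exactly once."""
        indices = np.random.permutation(n_samples)
        if batch_size is None:
            batch_size = n_samples  # single big batch, mirroring the buffer's default
        start_idx = 0
        while start_idx < n_samples:
            yield indices[start_idx : start_idx + batch_size]
            start_idx += batch_size


    for batch_inds in iterate_minibatches(n_samples=8, batch_size=3):
        print(batch_inds)  # e.g. [5 0 7], then [2 6 1], then [4 3]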
18 changes: 9 additions & 9 deletions sb3_contrib/common/maskable/distributions.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Optional, TypeVar, Union
+from typing import TypeVar, Union
 
 import numpy as np
 import torch as th
@@ -13,7 +13,7 @@
 SelfMaskableMultiCategoricalDistribution = TypeVar(
     "SelfMaskableMultiCategoricalDistribution", bound="MaskableMultiCategoricalDistribution"
 )
-MaybeMasks = Union[th.Tensor, np.ndarray, None]
+MaybeMasks = Union[th.Tensor, np.ndarray, None]  # noqa: UP007
 
 
 class MaskableCategorical(Categorical):
@@ -34,12 +34,12 @@ class MaskableCategorical(Categorical):
 
     def __init__(
         self,
-        probs: Optional[th.Tensor] = None,
-        logits: Optional[th.Tensor] = None,
-        validate_args: Optional[bool] = None,
+        probs: th.Tensor | None = None,
+        logits: th.Tensor | None = None,
+        validate_args: bool | None = None,
         masks: MaybeMasks = None,
     ):
-        self.masks: Optional[th.Tensor] = None
+        self.masks: th.Tensor | None = None
         super().__init__(probs, logits, validate_args)
         self._original_logits = self.logits
         self.apply_masking(masks)
@@ -112,7 +112,7 @@ class MaskableCategoricalDistribution(MaskableDistribution):
 
     def __init__(self, action_dim: int):
         super().__init__()
-        self.distribution: Optional[MaskableCategorical] = None
+        self.distribution: MaskableCategorical | None = None
         self.action_dim = action_dim
 
     def proba_distribution_net(self, latent_dim: int) -> nn.Module:
@@ -212,7 +212,7 @@ def log_prob(self, actions: th.Tensor) -> th.Tensor:
 
         # Extract each discrete action and compute log prob for their respective distributions
         return th.stack(
-            [dist.log_prob(action) for dist, action in zip(self.distributions, th.unbind(actions, dim=1))], dim=1
+            [dist.log_prob(action) for dist, action in zip(self.distributions, th.unbind(actions, dim=1), strict=True)], dim=1
         ).sum(dim=1)
 
     def entropy(self) -> th.Tensor:
@@ -248,7 +248,7 @@ def apply_masking(self, masks: MaybeMasks) -> None:
         # Then split columnwise for each discrete action
         split_masks = th.split(masks_tensor, list(self.action_dims), dim=1)  # type: ignore[assignment]
 
-        for distribution, mask in zip(self.distributions, split_masks):
+        for distribution, mask in zip(self.distributions, split_masks, strict=True):
             distribution.apply_masking(mask)
 
 
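The masking technique behind ``MaskableCategorical.apply_masking`` is the standard one: logits of invalid actions are replaced with the most negative representable value, so their probability becomes numerically zero. A minimal sketch of the idea with a plain torch ``Categorical`` (not the library's exact implementation; the logits and mask are made up):

    import torch as th
    from torch.distributions import Categorical

    logits = th.tensor([1.0, 2.0, 0.5, -1.0])
    mask = th.tensor([True, False, True, False])  # False = invalid action

    # Replace invalid logits with the dtype's most negative value before building the distribution.
    masked_logits = th.where(mask, logits, th.tensor(th.finfo(logits.dtype).min))
    dist = Categorical(logits=masked_logits)

    print(dist.probs)  # invalid actions end up with ~0 probability
    assert dist.sample().item() in (0, 2)  # only valid actions can be drawn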
11 changes: 6 additions & 5 deletions sb3_contrib/common/maskable/evaluation.py
@@ -1,5 +1,6 @@
 import warnings
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import gymnasium as gym
 import numpy as np
@@ -12,16 +13,16 @@
 
 def evaluate_policy(
     model: MaskablePPO,
-    env: Union[gym.Env, VecEnv],
+    env: gym.Env | VecEnv,
     n_eval_episodes: int = 10,
     deterministic: bool = True,
     render: bool = False,
-    callback: Optional[Callable[[dict[str, Any], dict[str, Any]], None]] = None,
-    reward_threshold: Optional[float] = None,
+    callback: Callable[[dict[str, Any], dict[str, Any]], None] | None = None,
+    reward_threshold: float | None = None,
     return_episode_rewards: bool = False,
     warn: bool = True,
     use_masking: bool = True,
-) -> Union[tuple[float, float], tuple[list[float], list[int]]]:
+) -> tuple[float, float] | tuple[list[float], list[int]]:
     """
     Runs policy for ``n_eval_episodes`` episodes and returns average reward.
     If a vector env is passed in, this divides the episodes to evaluate onto the
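A usage sketch for the updated signature, continuing the ``MaskablePPO`` example from earlier (episode counts illustrative); the two unpacking forms follow the ``tuple[float, float] | tuple[list[float], list[int]]`` return annotation above:

    from sb3_contrib import MaskablePPO
    from sb3_contrib.common.envs import InvalidActionEnvDiscrete
    from sb3_contrib.common.maskable.evaluation import evaluate_policy

    env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)
    model = MaskablePPO("MlpPolicy", env, seed=32).learn(5_000)

    # Default: (mean_reward, std_reward); action masks are applied during evaluation.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, use_masking=True)

    # With return_episode_rewards=True: per-episode rewards and lengths instead.
    episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=20, return_episode_rewards=True)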