Merge branch 'master' into hybrid_PPO

araffin · web-flow · commit d8a1f77d2c29 · 2025-11-06T17:06:10.000+01:00
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 <img src="docs/\_static/img/logo.png" align="right" width="40%"/>
 
-[![CI](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib/workflows/CI/badge.svg)](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib/actions) [![codestyle](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![CI](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib/actions/workflows/ci.yml/badge.svg)](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib/actions/workflows/ci.yml) [![codestyle](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
 # Stable-Baselines3 - Contrib (SB3-Contrib)
 
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
@@ -3,6 +3,28 @@
 Changelog
 ==========
 
+Release 2.7.1a3 (WIP)
+--------------------------
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+
+New Features:
+^^^^^^^^^^^^^
+
+Bug Fixes:
+^^^^^^^^^^
+- Fix tensorboard log name for ``MaskablePPO``
+
+Deprecations:
+^^^^^^^^^^^^^
+
+Others:
+^^^^^^^
+
+Documentation:
+^^^^^^^^^^^^^^
+
 Release 2.7.0 (2025-07-25)
 --------------------------
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -35,7 +35,7 @@ exclude = """(?x)(
 
 [tool.pytest.ini_options]
 # Deterministic ordering for tests; useful for pytest-xdist.
-env = ["PYTHONHASHSEED=0"]
+# env = ["PYTHONHASHSEED=0"]
 
 filterwarnings = [
     # Tensorboard warnings
diff --git a/sb3_contrib/common/envs/invalid_actions_env.py b/sb3_contrib/common/envs/invalid_actions_env.py
@@ -20,7 +20,7 @@ def __init__(
             dim = 1
         assert n_invalid_actions < dim, f"Too many invalid actions: {n_invalid_actions} < {dim}"
 
-        space = spaces.Discrete(dim)
+        space = spaces.Discrete(dim)  # type: ignore[var-annotated]
         self.n_invalid_actions = n_invalid_actions
         self.possible_actions = np.arange(space.n, dtype=int)
         self.invalid_actions: list[int] = []
diff --git a/sb3_contrib/common/recurrent/policies.py b/sb3_contrib/common/recurrent/policies.py
@@ -296,7 +296,7 @@ def predict_values(
         features = super(ActorCriticPolicy, self).extract_features(obs, self.vf_features_extractor)
 
         if self.lstm_critic is not None:
-            latent_vf, lstm_states_vf = self._process_sequence(features, lstm_states, episode_starts, self.lstm_critic)
+            latent_vf, _ = self._process_sequence(features, lstm_states, episode_starts, self.lstm_critic)
         elif self.shared_lstm:
             # Use LSTM from the actor
             latent_pi, _ = self._process_sequence(features, lstm_states, episode_starts, self.lstm_actor)
diff --git a/sb3_contrib/ppo_mask/ppo_mask.py b/sb3_contrib/ppo_mask/ppo_mask.py
@@ -221,7 +221,7 @@ def collect_rollouts(
         while n_steps < n_rollout_steps:
             with th.no_grad():
                 # Convert to pytorch tensor or to TensorDict
-                obs_tensor = obs_as_tensor(self._last_obs, self.device)
+                obs_tensor = obs_as_tensor(self._last_obs, self.device)  # type: ignore[arg-type]
 
                 # This is the only change related to invalid action masking
                 if use_masking:
@@ -431,7 +431,7 @@ def learn(  # type: ignore[override]
         total_timesteps: int,
         callback: MaybeCallback = None,
         log_interval: int = 1,
-        tb_log_name: str = "PPO",
+        tb_log_name: str = "MaskablePPO",
         reset_num_timesteps: bool = True,
         use_masking: bool = True,
         progress_bar: bool = False,
diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt
@@ -1 +1 @@
-2.7.0
+2.7.1a3
diff --git a/tests/wrappers/test_action_masker.py b/tests/wrappers/test_action_masker.py
@@ -6,7 +6,7 @@
 
 
 class IdentityEnvDiscrete(IdentityEnv):
-    def __init__(self, dim: int = 1, ep_length: int = 100):
+    def __init__(self, dim=1, ep_length=100):
         """
         Identity environment for testing purposes
 
@@ -17,12 +17,12 @@ def __init__(self, dim: int = 1, ep_length: int = 100):
         self.useless_property = 1
         super().__init__(ep_length=ep_length, space=space)
 
-    def _action_masks(self) -> list[int]:
+    def _action_masks(self):  # -> list[bool]
         assert isinstance(self.action_space, spaces.Discrete)
         return [i == self.state for i in range(self.action_space.n)]
 
 
-def action_mask_fn(env: IdentityEnvDiscrete) -> list[int]:
+def action_mask_fn(env):  # -> list[bool]
     assert isinstance(env.action_space, spaces.Discrete)
     return [i == env.state for i in range(env.action_space.n)]