
Commit 179725c

Merge branch 'master' into copilot/fix-b1975246-02f5-4774-8cf1-737731887c11
2 parents: f6fa72f + f7a89e1

2 files changed: +3 −1 lines changed

docs/misc/changelog.rst

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ Documentation:
 - Added plotting documentation and examples
 - Added documentation clarifying gSDE (Generalized State-Dependent Exploration) inference behavior for PPO, SAC, and A2C algorithms
 - Documented Atari wrapper reset behavior where ``env.reset()`` may perform a no-op step instead of truly resetting when ``terminal_on_life_loss=True`` (default), and how to avoid this behavior by setting ``terminal_on_life_loss=False``
+- Clarified comment in ``_sample_action()`` method to better explain action scaling behavior for off-policy algorithms (@copilot)
 
 
 Release 2.7.0 (2025-07-25)
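
For context on the Atari wrapper entry above: the flag it refers to is ``terminal_on_life_loss`` on SB3's ``AtariWrapper``. A minimal sketch of opting out of the no-op reset behavior (the environment id is chosen for illustration and needs the Atari extras installed):

import gymnasium as gym
from stable_baselines3.common.atari_wrappers import AtariWrapper

# With terminal_on_life_loss=True (the default), env.reset() after a lost life
# may perform a no-op step instead of truly resetting; False restores full resets.
env = AtariWrapper(gym.make("BreakoutNoFrameskip-v4"), terminal_on_life_loss=False)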

stable_baselines3/common/off_policy_algorithm.py

Lines changed: 2 additions & 1 deletion
@@ -391,7 +391,8 @@ def _sample_action(
             unscaled_action = np.array([self.action_space.sample() for _ in range(n_envs)])
         else:
             # Note: when using continuous actions,
-            # we assume that the policy uses tanh to scale the action
+            # the policy internally uses tanh to bound the action but predict() returns
+            # actions unscaled to the original action space [low, high]
             # We use non-deterministic action in the case of SAC, for TD3, it does not matter
             assert self._last_obs is not None, "self._last_obs was not set"
             unscaled_action, _ = self.predict(self._last_obs, deterministic=False)
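
The behavior the new comment describes is the affine map between the tanh-bounded range [-1, 1] and the environment's Box bounds. A minimal sketch of that rescaling, mirroring the ``scale_action``/``unscale_action`` helpers on SB3's ``BasePolicy`` (the standalone functions and the Pendulum-style bounds below are illustrative, not the library API):

import numpy as np

def scale_action(action: np.ndarray, low: np.ndarray, high: np.ndarray) -> np.ndarray:
    # Map an action from the environment's [low, high] Box to [-1, 1],
    # the range a tanh-squashed policy output lives in.
    return 2.0 * ((action - low) / (high - low)) - 1.0

def unscale_action(scaled_action: np.ndarray, low: np.ndarray, high: np.ndarray) -> np.ndarray:
    # Inverse map: from [-1, 1] back to the original [low, high] action space.
    return low + 0.5 * (scaled_action + 1.0) * (high - low)

# Illustration with a Box(-2, 2) action space (e.g. Pendulum-v1's bounds):
low, high = np.array([-2.0]), np.array([2.0])
squashed = np.tanh(np.array([0.5]))  # tanh bounds the raw policy output to (-1, 1)
env_action = unscale_action(squashed, low, high)  # ~[0.924], inside [-2, 2]
assert np.allclose(scale_action(env_action, low, high), squashed)

This round trip is why the comment distinguishes the two spaces: the policy samples in the tanh range, while ``predict()`` hands back actions already mapped to [low, high], so callers never see the squashed values.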
