
Commit 6b65820

Add n_steps param and update changelog
1 parent bbf1036 commit 6b65820

9 files changed: +36, -19 lines changed

docs/misc/changelog.rst

Lines changed: 3 additions & 1 deletion

@@ -3,14 +3,16 @@
 Changelog
 ==========

-Release 2.6.1a1 (WIP)
+Release 2.7.0a0 (WIP)
 --------------------------

 Breaking Changes:
 ^^^^^^^^^^^^^^^^^

 New Features:
 ^^^^^^^^^^^^^
+- Added support for n-step returns for off-policy algorithms via the ``n_steps`` parameter
+- Added ``NStepReplayBuffer``, which makes it possible to compute n-step returns without additional memory requirements (and without for loops)

 Bug Fixes:
 ^^^^^^^^^^
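For reference, a minimal usage sketch of the new parameter, assuming the API added in this commit (the hyperparameter values are placeholders, not recommendations):

    from stable_baselines3 import SAC

    # Passing n_steps > 1 lets the algorithm select NStepReplayBuffer
    # automatically and forward n_steps and gamma to it.
    model = SAC("MlpPolicy", "Pendulum-v1", n_steps=3, verbose=1)
    model.learn(total_timesteps=10_000)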

stable_baselines3/common/base_class.py

Lines changed: 0 additions & 1 deletion

@@ -316,7 +316,6 @@ def _excluded_save_params(self) -> list[str]:
             "replay_buffer",
             "rollout_buffer",
             "_vec_normalize_env",
-            "_episode_storage",
             "_logger",
             "_custom_logger",
         ]

stable_baselines3/common/off_policy_algorithm.py

Lines changed: 9 additions & 2 deletions

@@ -11,7 +11,7 @@
 from gymnasium import spaces

 from stable_baselines3.common.base_class import BaseAlgorithm
-from stable_baselines3.common.buffers import DictReplayBuffer, ReplayBuffer
+from stable_baselines3.common.buffers import DictReplayBuffer, NStepReplayBuffer, ReplayBuffer
 from stable_baselines3.common.callbacks import BaseCallback
 from stable_baselines3.common.noise import ActionNoise, VectorizedActionNoise
 from stable_baselines3.common.policies import BasePolicy

@@ -51,6 +51,7 @@ class OffPolicyAlgorithm(BaseAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When ``n_steps > 1``, use n-step returns (with the ``NStepReplayBuffer``) when updating the Q-value network.
     :param policy_kwargs: Additional arguments to be passed to the policy on creation
     :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
         the reported success rate, mean episode length, and mean reward over

@@ -93,6 +94,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         policy_kwargs: Optional[dict[str, Any]] = None,
         stats_window_size: int = 100,
         tensorboard_log: Optional[str] = None,

@@ -134,7 +136,7 @@ def __init__(
         self.replay_buffer: Optional[ReplayBuffer] = None
         self.replay_buffer_class = replay_buffer_class
         self.replay_buffer_kwargs = replay_buffer_kwargs or {}
-        self._episode_storage = None
+        self.n_steps = n_steps

         # Save train freq parameter, will be converted later to TrainFreq object
         self.train_freq = train_freq

@@ -176,6 +178,11 @@ def _setup_model(self) -> None:
         if self.replay_buffer_class is None:
             if isinstance(self.observation_space, spaces.Dict):
                 self.replay_buffer_class = DictReplayBuffer
+                assert self.n_steps == 1, "N-step returns are not supported for Dict observation spaces yet."
+            elif self.n_steps > 1:
+                self.replay_buffer_class = NStepReplayBuffer
+                # Add required arguments for computing n-step returns
+                self.replay_buffer_kwargs.update({"n_steps": self.n_steps, "gamma": self.gamma})
             else:
                 self.replay_buffer_class = ReplayBuffer

stable_baselines3/ddpg/ddpg.py

Lines changed: 4 additions & 1 deletion

@@ -44,6 +44,7 @@ class DDPG(TD3):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When ``n_steps > 1``, use n-step returns (with the ``NStepReplayBuffer``) when updating the Q-value network.
     :param policy_kwargs: additional arguments to be passed to the policy on creation. See :ref:`ddpg_policies`
     :param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
         debug messages

@@ -69,6 +70,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         tensorboard_log: Optional[str] = None,
         policy_kwargs: Optional[dict[str, Any]] = None,
         verbose: int = 0,

@@ -90,12 +92,13 @@ def __init__(
             action_noise=action_noise,
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
             seed=seed,
-            optimize_memory_usage=optimize_memory_usage,
             # Remove all tricks from TD3 to obtain DDPG:
             # we still need to specify target_policy_noise > 0 to avoid errors
             policy_delay=1,

stable_baselines3/dqn/dqn.py

Lines changed: 4 additions & 1 deletion

@@ -44,6 +44,7 @@ class DQN(OffPolicyAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When ``n_steps > 1``, use n-step returns (with the ``NStepReplayBuffer``) when updating the Q-value network.
     :param target_update_interval: update the target network every ``target_update_interval``
         environment steps.
     :param exploration_fraction: fraction of entire training period over which the exploration rate is reduced

@@ -88,6 +89,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         target_update_interval: int = 10000,
         exploration_fraction: float = 0.1,
         exploration_initial_eps: float = 1.0,

@@ -115,14 +117,15 @@ def __init__(
             action_noise=None,  # No action noise
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
             seed=seed,
             sde_support=False,
-            optimize_memory_usage=optimize_memory_usage,
             supported_action_spaces=(spaces.Discrete,),
             support_multi_env=True,
         )

stable_baselines3/sac/sac.py

Lines changed: 4 additions & 1 deletion

@@ -53,6 +53,7 @@ class SAC(OffPolicyAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When ``n_steps > 1``, use n-step returns (with the ``NStepReplayBuffer``) when updating the Q-value network.
     :param ent_coef: Entropy regularization coefficient. (Equivalent to
         inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off.
         Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)

@@ -103,6 +104,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         ent_coef: Union[str, float] = "auto",
         target_update_interval: int = 1,
         target_entropy: Union[str, float] = "auto",

@@ -131,6 +133,8 @@ def __init__(
             action_noise,
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,

@@ -140,7 +144,6 @@ def __init__(
             use_sde=use_sde,
             sde_sample_freq=sde_sample_freq,
             use_sde_at_warmup=use_sde_at_warmup,
-            optimize_memory_usage=optimize_memory_usage,
             supported_action_spaces=(spaces.Box,),
             support_multi_env=True,
         )

stable_baselines3/td3/td3.py

Lines changed: 4 additions & 1 deletion

@@ -48,6 +48,7 @@ class TD3(OffPolicyAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When ``n_steps > 1``, use n-step returns (with the ``NStepReplayBuffer``) when updating the Q-value network.
     :param policy_delay: Policy and target networks will only be updated once every policy_delay steps
         per training steps. The Q values will be updated policy_delay more often (update every training step).
     :param target_policy_noise: Standard deviation of Gaussian noise added to target policy

@@ -92,6 +93,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         policy_delay: int = 2,
         target_policy_noise: float = 0.2,
         target_noise_clip: float = 0.5,

@@ -117,14 +119,15 @@ def __init__(
             action_noise=action_noise,
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
             seed=seed,
             sde_support=False,
-            optimize_memory_usage=optimize_memory_usage,
             supported_action_spaces=(spaces.Box,),
             support_multi_env=True,
         )

stable_baselines3/version.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.6.1a1
+2.7.0a0

tests/test_n_step_replay.py

Lines changed: 7 additions & 10 deletions

@@ -2,33 +2,30 @@
 import numpy as np
 import pytest

-from stable_baselines3 import DQN, SAC
+from stable_baselines3 import DQN, SAC, TD3
 from stable_baselines3.common.buffers import NStepReplayBuffer, ReplayBuffer
 from stable_baselines3.common.env_util import make_vec_env


-@pytest.mark.parametrize("model_class", [SAC, DQN])
+@pytest.mark.parametrize("model_class", [SAC, DQN, TD3])
 def test_run(model_class):
     env_id = "CartPole-v1" if model_class == DQN else "Pendulum-v1"
     env = make_vec_env(env_id, n_envs=2)
-
-    n_steps = 2
-    gamma = 0.99
+    gamma = 0.989

     model = model_class(
         "MlpPolicy",
         env,
-        replay_buffer_class=NStepReplayBuffer,
-        replay_buffer_kwargs=dict(
-            n_steps=n_steps,
-            gamma=gamma,
-        ),
         train_freq=4,
+        n_steps=3,
         policy_kwargs=dict(net_arch=[64]),
         learning_starts=100,
         buffer_size=int(2e4),
         gamma=gamma,
     )
+    assert isinstance(model.replay_buffer, NStepReplayBuffer)
+    assert model.replay_buffer.n_steps == 3
+    assert model.replay_buffer.gamma == gamma

     model.learn(total_timesteps=150)