
Commit c7d151d

Merge branch 'master' into master
2 parents 6134732 + e206fc5 commit c7d151d

13 files changed (+333 −13 lines)


docs/misc/changelog.rst

Lines changed: 3 additions & 1 deletion
@@ -3,14 +3,16 @@
 Changelog
 ==========
 
-Release 2.6.1a1 (WIP)
+Release 2.7.0a0 (WIP)
 --------------------------
 
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 
 New Features:
 ^^^^^^^^^^^^^
+- Added support for n-step returns for off-policy algorithms via the `n_steps` parameter
+- Added ``NStepReplayBuffer`` that allows computing n-step returns without additional memory requirement (and without for loops)
 
 Bug Fixes:
 ^^^^^^^^^^
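
For context, a minimal usage sketch of the feature described in these changelog entries. The environment id and hyperparameter values are illustrative assumptions, not taken from this commit:

    # Hedged sketch: enabling n-step returns via the new `n_steps` parameter.
    # "CartPole-v1" and the hyperparameters below are placeholder choices.
    from stable_baselines3 import DQN

    # With n_steps > 1, the algorithm is expected to pick the NStepReplayBuffer
    # automatically and accumulate discounted rewards over 3 steps
    # before bootstrapping from the target network.
    model = DQN("MlpPolicy", "CartPole-v1", n_steps=3, gamma=0.99, verbose=1)
    model.learn(total_timesteps=10_000)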

stable_baselines3/common/base_class.py

Lines changed: 0 additions & 1 deletion
@@ -316,7 +316,6 @@ def _excluded_save_params(self) -> list[str]:
             "replay_buffer",
             "rollout_buffer",
             "_vec_normalize_env",
-            "_episode_storage",
             "_logger",
             "_custom_logger",
         ]

stable_baselines3/common/buffers.py

Lines changed: 110 additions & 0 deletions
@@ -838,3 +838,113 @@ def _get_samples( # type: ignore[override]
             advantages=self.to_torch(self.advantages[batch_inds].flatten()),
             returns=self.to_torch(self.returns[batch_inds].flatten()),
         )
+
+
+class NStepReplayBuffer(ReplayBuffer):
+    """
+    Replay buffer used for computing n-step returns in off-policy algorithms like SAC/DQN.
+
+    The n-step return combines multiple steps of future rewards,
+    discounted by the discount factor gamma.
+    This can help improve sample efficiency and credit assignment.
+
+    This implementation uses the same storage space as a normal replay buffer,
+    and NumPy vectorized operations at sampling time to efficiently compute the
+    n-step return, without requiring extra memory.
+
+    This implementation is inspired by:
+    - https://github.com/younggyoseo/FastTD3
+    - https://github.com/DLR-RM/stable-baselines3/pull/81
+
+    It avoids potential issues such as:
+    - https://github.com/younggyoseo/FastTD3/issues/6
+
+    :param buffer_size: Max number of elements in the buffer
+    :param observation_space: Observation space
+    :param action_space: Action space
+    :param device: PyTorch device
+    :param n_envs: Number of parallel environments
+    :param optimize_memory_usage: Not supported
+    :param handle_timeout_termination: Handle timeout termination (due to timelimit)
+        separately and treat the task as infinite horizon task.
+        https://github.com/DLR-RM/stable-baselines3/issues/284
+    :param n_steps: Number of steps to accumulate rewards for n-step returns
+    :param gamma: Discount factor for future rewards
+    """
+
+    def __init__(self, *args, n_steps: int = 3, gamma: float = 0.99, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.n_steps = n_steps
+        self.gamma = gamma
+        if self.optimize_memory_usage:
+            raise NotImplementedError("NStepReplayBuffer doesn't support optimize_memory_usage=True")
+
+    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
+        """
+        Sample a batch of transitions and compute n-step returns.
+
+        For each sampled transition, the method computes the cumulative discounted reward over
+        the next `n_steps`, properly handling episode termination and timeouts.
+        The next observation and done flag correspond to the last transition in the computed n-step trajectory.
+
+        :param batch_inds: Indices of samples to retrieve
+        :param env: Optional VecNormalize environment for normalizing observations/rewards
+        :return: A batch of samples with n-step returns and corresponding observations/actions
+        """
+        # Randomly choose env indices for each sample
+        env_indices = np.random.randint(0, self.n_envs, size=batch_inds.shape)
+
+        # Note: the self.pos index is dangerous (will overlap two different episodes when buffer is full)
+        # so we set self.pos-1 to truncated=True (temporarily) if done=False and truncated=False
+        last_valid_index = self.pos - 1
+        original_timeout_values = self.timeouts[last_valid_index].copy()
+        self.timeouts[last_valid_index] = np.logical_or(original_timeout_values, np.logical_not(self.dones[last_valid_index]))
+
+        # Compute n-step indices with wrap-around
+        steps = np.arange(self.n_steps).reshape(1, -1)  # shape: [1, n_steps]
+        indices = (batch_inds[:, None] + steps) % self.buffer_size  # shape: [batch, n_steps]
+
+        # Retrieve sequences of transitions
+        rewards_seq = self._normalize_reward(self.rewards[indices, env_indices[:, None]], env)  # [batch, n_steps]
+        dones_seq = self.dones[indices, env_indices[:, None]]  # [batch, n_steps]
+        truncated_seq = self.timeouts[indices, env_indices[:, None]]  # [batch, n_steps]
+
+        # Compute masks: 1 until first done/truncation (inclusive)
+        done_or_truncated = np.logical_or(dones_seq, truncated_seq)
+        done_idx = done_or_truncated.argmax(axis=1)
+        # If no done/truncation, keep full sequence
+        has_done_or_truncated = done_or_truncated.any(axis=1)
+        done_idx = np.where(has_done_or_truncated, done_idx, self.n_steps - 1)
+
+        mask = np.arange(self.n_steps).reshape(1, -1) <= done_idx[:, None]  # shape: [batch, n_steps]
+        # Compute discount factors for bootstrapping (using target Q-Value)
+        # It is gamma ** n_steps by default but should be adjusted in case of early termination/truncation.
+        target_q_discounts = self.gamma ** mask.sum(axis=1, keepdims=True).astype(np.float32)  # [batch, 1]
+
+        # Apply discount
+        discounts = self.gamma ** np.arange(self.n_steps, dtype=np.float32).reshape(1, -1)  # [1, n_steps]
+        discounted_rewards = rewards_seq * discounts * mask
+        n_step_returns = discounted_rewards.sum(axis=1, keepdims=True)  # [batch, 1]
+
+        # Compute indices of next_obs/done at the final point of the n-step transition
+        last_indices = (batch_inds + done_idx) % self.buffer_size
+        next_obs = self._normalize_obs(self.next_observations[last_indices, env_indices], env)
+        next_dones = self.dones[last_indices, env_indices][:, None].astype(np.float32)
+        next_timeouts = self.timeouts[last_indices, env_indices][:, None].astype(np.float32)
+        final_dones = next_dones * (1.0 - next_timeouts)
+
+        # Revert back tmp changes to avoid sampling across episodes
+        self.timeouts[last_valid_index] = original_timeout_values
+
+        # Gather observations and actions
+        obs = self._normalize_obs(self.observations[batch_inds, env_indices], env)
+        actions = self.actions[batch_inds, env_indices]
+
+        return ReplayBufferSamples(
+            observations=self.to_torch(obs),  # type: ignore[arg-type]
+            actions=self.to_torch(actions),
+            next_observations=self.to_torch(next_obs),  # type: ignore[arg-type]
+            dones=self.to_torch(final_dones),
+            rewards=self.to_torch(n_step_returns),
+            discounts=self.to_torch(target_q_discounts),
+        )
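
To make the vectorized sampling logic above concrete, here is a small self-contained NumPy sketch (toy numbers, not from the diff) of the masking and discounting steps: rewards past the first done/truncation are masked out, per-step discounts are applied, and the bootstrap discount shrinks to gamma**K where K is the number of steps actually used.

    import numpy as np

    gamma, n_steps = 0.99, 3
    # Toy reward/done sequences for a batch of 2 sampled transitions
    rewards_seq = np.array([[1.0, 1.0, 1.0],
                            [1.0, 2.0, 3.0]], dtype=np.float32)  # [batch, n_steps]
    dones_seq = np.array([[False, True, False],
                          [False, False, False]])                # episode 0 ends at step 1

    # Index of the last step to keep (full window if no done/truncation)
    done_idx = np.where(dones_seq.any(axis=1), dones_seq.argmax(axis=1), n_steps - 1)
    mask = np.arange(n_steps)[None, :] <= done_idx[:, None]      # [batch, n_steps]

    step_discounts = gamma ** np.arange(n_steps, dtype=np.float32)[None, :]
    n_step_returns = (rewards_seq * step_discounts * mask).sum(axis=1, keepdims=True)
    bootstrap_discounts = gamma ** mask.sum(axis=1, keepdims=True).astype(np.float32)

    print(n_step_returns)       # [[1 + 0.99*1], [1 + 0.99*2 + 0.99**2*3]]
    print(bootstrap_discounts)  # [[0.99**2], [0.99**3]]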

stable_baselines3/common/off_policy_algorithm.py

Lines changed: 9 additions & 2 deletions
@@ -11,7 +11,7 @@
 from gymnasium import spaces
 
 from stable_baselines3.common.base_class import BaseAlgorithm
-from stable_baselines3.common.buffers import DictReplayBuffer, ReplayBuffer
+from stable_baselines3.common.buffers import DictReplayBuffer, NStepReplayBuffer, ReplayBuffer
 from stable_baselines3.common.callbacks import BaseCallback
 from stable_baselines3.common.noise import ActionNoise, VectorizedActionNoise
 from stable_baselines3.common.policies import BasePolicy
@@ -51,6 +51,7 @@ class OffPolicyAlgorithm(BaseAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When n_steps > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
     :param policy_kwargs: Additional arguments to be passed to the policy on creation
     :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
         the reported success rate, mean episode length, and mean reward over
@@ -93,6 +94,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         policy_kwargs: Optional[dict[str, Any]] = None,
         stats_window_size: int = 100,
         tensorboard_log: Optional[str] = None,
@@ -134,7 +136,7 @@ def __init__(
         self.replay_buffer: Optional[ReplayBuffer] = None
         self.replay_buffer_class = replay_buffer_class
         self.replay_buffer_kwargs = replay_buffer_kwargs or {}
-        self._episode_storage = None
+        self.n_steps = n_steps
 
         # Save train freq parameter, will be converted later to TrainFreq object
         self.train_freq = train_freq
@@ -176,6 +178,11 @@ def _setup_model(self) -> None:
         if self.replay_buffer_class is None:
             if isinstance(self.observation_space, spaces.Dict):
                 self.replay_buffer_class = DictReplayBuffer
+                assert self.n_steps == 1, "N-step returns are not supported for Dict observation spaces yet."
+            elif self.n_steps > 1:
+                self.replay_buffer_class = NStepReplayBuffer
+                # Add required arguments for computing n-step returns
+                self.replay_buffer_kwargs.update({"n_steps": self.n_steps, "gamma": self.gamma})
             else:
                 self.replay_buffer_class = ReplayBuffer
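
Based on the `_setup_model` logic above, setting `n_steps > 1` should be roughly equivalent to passing the buffer class and its kwargs explicitly. A hedged sketch (the environment id is a placeholder assumption):

    from stable_baselines3 import SAC
    from stable_baselines3.common.buffers import NStepReplayBuffer

    # What `_setup_model` effectively does when n_steps=3 is requested
    model = SAC(
        "MlpPolicy",
        "Pendulum-v1",  # placeholder environment
        replay_buffer_class=NStepReplayBuffer,
        replay_buffer_kwargs={"n_steps": 3, "gamma": 0.99},
    )
    # Per the assert above, Dict observation spaces are not supported with n_steps > 1 yet.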

stable_baselines3/common/type_aliases.py

Lines changed: 3 additions & 0 deletions
@@ -52,6 +52,8 @@ class ReplayBufferSamples(NamedTuple):
     next_observations: th.Tensor
     dones: th.Tensor
     rewards: th.Tensor
+    # For n-step replay buffer
+    discounts: Optional[th.Tensor] = None
 
 
 class DictReplayBufferSamples(NamedTuple):
@@ -60,6 +62,7 @@ class DictReplayBufferSamples(NamedTuple):
     next_observations: TensorDict
     dones: th.Tensor
     rewards: th.Tensor
+    discounts: Optional[th.Tensor] = None
 
 
 class RolloutReturn(NamedTuple):
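
A quick illustration of why this change is backward compatible: the new field has a default value, so buffers that keep returning five fields are unaffected and consumers can fall back to the plain `gamma`. A minimal sketch (assumes the patched `type_aliases` and PyTorch installed):

    import torch as th
    from stable_baselines3.common.type_aliases import ReplayBufferSamples

    dummy = th.zeros(2, 1)
    # No `discounts` passed: the field defaults to None
    sample = ReplayBufferSamples(dummy, dummy, dummy, dummy, dummy)
    assert sample.discounts is None  # training code then uses gamma directly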

stable_baselines3/ddpg/ddpg.py

Lines changed: 4 additions & 1 deletion
@@ -44,6 +44,7 @@ class DDPG(TD3):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When n_steps > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
     :param policy_kwargs: additional arguments to be passed to the policy on creation. See :ref:`ddpg_policies`
     :param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
         debug messages
@@ -69,6 +70,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         tensorboard_log: Optional[str] = None,
         policy_kwargs: Optional[dict[str, Any]] = None,
         verbose: int = 0,
@@ -90,12 +92,13 @@ def __init__(
             action_noise=action_noise,
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
             seed=seed,
-            optimize_memory_usage=optimize_memory_usage,
             # Remove all tricks from TD3 to obtain DDPG:
             # we still need to specify target_policy_noise > 0 to avoid errors
             policy_delay=1,

stable_baselines3/dqn/dqn.py

Lines changed: 7 additions & 2 deletions
@@ -44,6 +44,7 @@ class DQN(OffPolicyAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When n_steps > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
     :param target_update_interval: update the target network every ``target_update_interval``
         environment steps.
     :param exploration_fraction: fraction of entire training period over which the exploration rate is reduced
@@ -88,6 +89,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         target_update_interval: int = 10000,
         exploration_fraction: float = 0.1,
         exploration_initial_eps: float = 1.0,
@@ -115,14 +117,15 @@ def __init__(
             action_noise=None,  # No action noise
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,
             verbose=verbose,
             device=device,
             seed=seed,
             sde_support=False,
-            optimize_memory_usage=optimize_memory_usage,
             supported_action_spaces=(spaces.Discrete,),
             support_multi_env=True,
         )
@@ -191,6 +194,8 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None:
         for _ in range(gradient_steps):
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
+            # For n-step replay, discount factor is gamma**n_steps (when no early termination)
+            discounts = replay_data.discounts if replay_data.discounts is not None else self.gamma
 
             with th.no_grad():
                 # Compute the next Q-values using the target network
@@ -200,7 +205,7 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None:
                 # Avoid potential broadcast issue
                 next_q_values = next_q_values.reshape(-1, 1)
                 # 1-step TD target
-                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values
+                target_q_values = replay_data.rewards + (1 - replay_data.dones) * discounts * next_q_values
 
             # Get current Q-values estimates
             current_q_values = self.q_net(replay_data.observations)
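
For reference, the modified target line corresponds to the n-step TD target below (our notation, not from the diff), where K ≤ n_steps is the number of rewards actually accumulated before the first termination or truncation, so that `discounts` equals gamma^K:

    y = \sum_{k=0}^{K-1} \gamma^k r_{t+k} + (1 - d_{t+K}) \, \gamma^{K} \max_{a'} Q_{\text{target}}(s_{t+K}, a')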

stable_baselines3/her/her_replay_buffer.py

Lines changed: 1 addition & 1 deletion
@@ -402,7 +402,7 @@ def truncate_last_trajectory(self) -> None:
             self.dones[self.pos - 1, env_idx] = True
             # make sure that last episodes can be sampled and
             # update next episode start (self._current_ep_start)
-            self._compute_episode_length(env_idx)
+            self._compute_episode_length(int(env_idx))
             # handle infinite horizon tasks
             if self.handle_timeout_termination:
                 self.timeouts[self.pos - 1, env_idx] = True  # not an actual timeout, but it allows bootstrapping

stable_baselines3/sac/sac.py

Lines changed: 7 additions & 2 deletions
@@ -53,6 +53,7 @@ class SAC(OffPolicyAlgorithm):
     :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
         at a cost of more complexity.
         See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
+    :param n_steps: When n_steps > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
     :param ent_coef: Entropy regularization coefficient. (Equivalent to
         inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off.
         Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
@@ -103,6 +104,7 @@ def __init__(
         replay_buffer_class: Optional[type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        n_steps: int = 1,
         ent_coef: Union[str, float] = "auto",
         target_update_interval: int = 1,
         target_entropy: Union[str, float] = "auto",
@@ -131,6 +133,8 @@ def __init__(
             action_noise,
             replay_buffer_class=replay_buffer_class,
             replay_buffer_kwargs=replay_buffer_kwargs,
+            optimize_memory_usage=optimize_memory_usage,
+            n_steps=n_steps,
             policy_kwargs=policy_kwargs,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,
@@ -140,7 +144,6 @@ def __init__(
             use_sde=use_sde,
             sde_sample_freq=sde_sample_freq,
             use_sde_at_warmup=use_sde_at_warmup,
-            optimize_memory_usage=optimize_memory_usage,
             supported_action_spaces=(spaces.Box,),
             support_multi_env=True,
         )
@@ -213,6 +216,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
         for gradient_step in range(gradient_steps):
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
+            # For n-step replay, discount factor is gamma**n_steps (when no early termination)
+            discounts = replay_data.discounts if replay_data.discounts is not None else self.gamma
 
             # We need to sample because `log_std` may have changed between two gradient steps
             if self.use_sde:
@@ -252,7 +257,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
                 # add entropy term
                 next_q_values = next_q_values - ent_coef * next_log_prob.reshape(-1, 1)
                 # td error + entropy term
-                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values
+                target_q_values = replay_data.rewards + (1 - replay_data.dones) * discounts * next_q_values
 
             # Get current Q-values estimates for each critic network
             # using action from the replay buffer
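
The same n-step structure applies here, with SAC's entropy term attached to the bootstrapped value (our notation, K ≤ n_steps as above, a' sampled from the current policy at s_{t+K}):

    y = \sum_{k=0}^{K-1} \gamma^k r_{t+k} + (1 - d_{t+K}) \, \gamma^{K} \Bigl( \min_{i=1,2} Q_{\theta'_i}(s_{t+K}, a') - \alpha \log \pi(a' \mid s_{t+K}) \Bigr)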
