
Commit fc09267 (parent cd362ea)
Fix discounts for early terminations and fix reward normalization

6 files changed: 40 additions, 25 deletions

stable_baselines3/common/buffers.py (7 additions, 4 deletions)

@@ -894,18 +894,18 @@ def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None
         # Randomly choose env indices for each sample
         env_indices = np.random.randint(0, self.n_envs, size=batch_inds.shape)

-        # Compute n-step indices with wrap-around
-        steps = np.arange(self.n_steps).reshape(1, -1)  # shape: [1, n_steps]
         # Note: the self.pos index is dangerous (will overlap two different episodes when buffer is full)
         # so we set self.pos-1 to truncated=True (temporarily) if done=False
         # TODO: avoid copying the whole array (requires some more indices trickery)
         safe_timeouts = self.timeouts.copy()
         safe_timeouts[self.pos - 1, :] = np.logical_not(self.dones[self.pos - 1, :])

+        # Compute n-step indices with wrap-around
+        steps = np.arange(self.n_steps).reshape(1, -1)  # shape: [1, n_steps]
         indices = (batch_inds[:, None] + steps) % self.buffer_size  # shape: [batch, n_steps]

         # Retrieve sequences of transitions
-        rewards_seq = self.rewards[indices, env_indices[:, None]]  # [batch, n_steps]
+        rewards_seq = self._normalize_reward(self.rewards[indices, env_indices[:, None]], env)  # [batch, n_steps]
         dones_seq = self.dones[indices, env_indices[:, None]]  # [batch, n_steps]
         truncs_seq = safe_timeouts[indices, env_indices[:, None]]  # [batch, n_steps]

@@ -917,6 +917,9 @@ def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None
         done_idx = np.where(has_done_or_trunc, done_idx, self.n_steps - 1)

         mask = np.arange(self.n_steps).reshape(1, -1) <= done_idx[:, None]  # shape: [batch, n_steps]
+        # Compute discount factors for bootstrapping (using target Q-Value)
+        # It is gamma ** n_steps by default but should be adjusted in case of early termination/truncation.
+        target_q_discounts = self.gamma ** mask.sum(axis=1, keepdims=True).astype(np.float32)  # [batch, 1]

         # Apply discount
         discounts = self.gamma ** np.arange(self.n_steps, dtype=np.float32).reshape(1, -1)  # [1, n_steps]

@@ -939,6 +942,6 @@ def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None
             actions=self.to_torch(actions),
             next_observations=self.to_torch(next_obs),  # type: ignore[arg-type]
             dones=self.to_torch(final_dones),
-            # FIXME: what to do with self._normalize_reward ?
             rewards=self.to_torch(n_step_returns),
+            discounts=self.to_torch(target_q_discounts),
         )
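
To make the new bootstrap discount concrete, here is a minimal standalone sketch (plain NumPy with illustrative values, not the buffer code itself) of how the per-sample discount follows from the done/truncation mask, assuming gamma=0.99 and n_steps=3:

import numpy as np

gamma, n_steps = 0.99, 3
dones_seq = np.array([[0.0, 1.0, 0.0],   # episode terminates after the 2nd step
                      [0.0, 0.0, 0.0]])  # no termination within the window
truncs_seq = np.zeros_like(dones_seq)

# Index of the first done/truncation, or n_steps - 1 if there is none
has_done_or_trunc = (dones_seq + truncs_seq).max(axis=1) > 0
done_idx = np.argmax(dones_seq + truncs_seq, axis=1)
done_idx = np.where(has_done_or_trunc, done_idx, n_steps - 1)

# Transitions up to (and including) the first done/truncation are kept
mask = np.arange(n_steps).reshape(1, -1) <= done_idx[:, None]  # [batch, n_steps]
# Effective horizon per sample: gamma**2 for the first row, gamma**3 for the second
target_q_discounts = gamma ** mask.sum(axis=1, keepdims=True).astype(np.float32)
print(target_q_discounts)  # approx. [[0.9801], [0.9703]]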

stable_baselines3/common/type_aliases.py (3 additions, 0 deletions)

@@ -52,6 +52,8 @@ class ReplayBufferSamples(NamedTuple):
     next_observations: th.Tensor
     dones: th.Tensor
     rewards: th.Tensor
+    # For n-step replay buffer
+    discounts: Optional[th.Tensor] = None


 class DictReplayBufferSamples(NamedTuple):

@@ -60,6 +62,7 @@ class DictReplayBufferSamples(NamedTuple):
     next_observations: TensorDict
     dones: th.Tensor
     rewards: th.Tensor
+    discounts: Optional[th.Tensor] = None


 class RolloutReturn(NamedTuple):
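
Because the new discounts field defaults to None, buffers that do not compute n-step returns keep working unchanged; a small sketch of that backward compatibility (constructing the NamedTuple directly with dummy tensors, once this commit is applied):

import torch as th
from stable_baselines3.common.type_aliases import ReplayBufferSamples

dummy = th.zeros(2, 1)
# No `discounts` argument: the field defaults to None and consumers
# fall back to their scalar self.gamma (see the algorithm changes below).
samples = ReplayBufferSamples(
    observations=dummy,
    actions=dummy,
    next_observations=dummy,
    dones=dummy,
    rewards=dummy,
)
assert samples.discounts is None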

stable_baselines3/dqn/dqn.py (3 additions, 1 deletion)

@@ -191,6 +191,8 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None:
         for _ in range(gradient_steps):
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
+            # For n-step replay, discount factor is gamma**n_steps (when no early termination)
+            discounts = replay_data.discounts if replay_data.discounts is not None else self.gamma

             with th.no_grad():
                 # Compute the next Q-values using the target network

@@ -200,7 +202,7 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None:
                 # Avoid potential broadcast issue
                 next_q_values = next_q_values.reshape(-1, 1)
                 # 1-step TD target
-                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values
+                target_q_values = replay_data.rewards + (1 - replay_data.dones) * discounts * next_q_values

             # Get current Q-values estimates
             current_q_values = self.q_net(replay_data.observations)
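
The same one-line fallback is used in SAC and TD3 below. Whether discounts ends up being a Python float or a [batch, 1] tensor, the TD target broadcasts the same way; a quick shape-check sketch (hypothetical values, PyTorch only):

import torch as th

batch = 4
rewards = th.rand(batch, 1)       # n-step returns from the buffer
dones = th.zeros(batch, 1)
next_q_values = th.rand(batch, 1)

# Scalar gamma (1-step buffer) vs per-sample discounts (n-step buffer)
for discounts in (0.99, th.full((batch, 1), 0.99**3)):
    target_q_values = rewards + (1 - dones) * discounts * next_q_values
    assert target_q_values.shape == (batch, 1)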

stable_baselines3/sac/sac.py (3 additions, 1 deletion)

@@ -213,6 +213,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
         for gradient_step in range(gradient_steps):
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
+            # For n-step replay, discount factor is gamma**n_steps (when no early termination)
+            discounts = replay_data.discounts if replay_data.discounts is not None else self.gamma

             # We need to sample because `log_std` may have changed between two gradient steps
             if self.use_sde:

@@ -252,7 +254,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
                 # add entropy term
                 next_q_values = next_q_values - ent_coef * next_log_prob.reshape(-1, 1)
                 # td error + entropy term
-                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values
+                target_q_values = replay_data.rewards + (1 - replay_data.dones) * discounts * next_q_values

             # Get current Q-values estimates for each critic network
             # using action from the replay buffer

stable_baselines3/td3/td3.py (3 additions, 1 deletion)

@@ -163,6 +163,8 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None:
             self._n_updates += 1
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)  # type: ignore[union-attr]
+            # For n-step replay, discount factor is gamma**n_steps (when no early termination)
+            discounts = replay_data.discounts if replay_data.discounts is not None else self.gamma

             with th.no_grad():
                 # Select action according to policy and add clipped noise

@@ -173,7 +175,7 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None:
                 # Compute the next Q-values: min over all critics targets
                 next_q_values = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
                 next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
-                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values
+                target_q_values = replay_data.rewards + (1 - replay_data.dones) * discounts * next_q_values

             # Get current Q-values estimates for each critic network
             current_q_values = self.critic(replay_data.observations, replay_data.actions)

tests/test_n_step_replay.py (21 additions, 18 deletions)

@@ -12,10 +12,8 @@ def test_run(model_class):
    env_id = "CartPole-v1" if model_class == DQN else "Pendulum-v1"
    env = make_vec_env(env_id, n_envs=2)

-    # FIXME: need to set the discount factor manually
    n_steps = 2
    gamma = 0.99
-    discount = gamma**n_steps

    model = model_class(
        "MlpPolicy",

@@ -29,7 +27,7 @@ def test_run(model_class):
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        buffer_size=int(2e4),
-        gamma=discount,
+        gamma=gamma,
    )

    model.learn(total_timesteps=150)

@@ -103,11 +101,11 @@ def test_nstep_early_termination(done_at, n_steps):

    base_idx = 0
    batch = buffer._get_samples(np.array([base_idx]))
-    actual = batch.rewards.numpy().item()
+    actual = batch.rewards.item()

    expected = compute_expected_nstep_reward(gamma=0.99, n_steps=n_steps, stop_idx=done_at - base_idx)
    np.testing.assert_allclose(actual, expected, rtol=1e-4)
-    assert batch.dones.numpy().item() == 1.0
+    assert batch.dones.item() == 1.0


@pytest.mark.parametrize("truncated_at", [1, 2])

@@ -117,46 +115,51 @@ def test_nstep_early_truncation(truncated_at):

    base_idx = 0
    batch = buffer._get_samples(np.array([base_idx]))
-    actual = batch.rewards.numpy().item()
+    actual = batch.rewards.item()

    expected = compute_expected_nstep_reward(gamma=0.99, n_steps=3, stop_idx=truncated_at - base_idx)
    np.testing.assert_allclose(actual, expected, rtol=1e-4)
-    assert batch.dones.numpy().item() == 0.0
+    assert batch.dones.item() == 0.0


@pytest.mark.parametrize("n_steps", [3, 5])
-def test_nstep_no_termination_or_truncation(n_steps):
+def test_nstep_no_terminations(n_steps):
    buffer = create_buffer(n_steps=n_steps)
    fill_buffer(buffer, length=10)  # no done or truncation
+    gamma = 0.99

    base_idx = 3
    batch = buffer._get_samples(np.array([base_idx]))
-    actual = batch.rewards.numpy().item()
-
-    expected = compute_expected_nstep_reward(gamma=0.99, n_steps=n_steps)
+    actual = batch.rewards.item()
+    # Discount factor for bootstrapping with target Q-Value
+    np.testing.assert_allclose(batch.discounts.item(), gamma**n_steps)
+    expected = compute_expected_nstep_reward(gamma=gamma, n_steps=n_steps)
    np.testing.assert_allclose(actual, expected, rtol=1e-4)
-    assert batch.dones.numpy().item() == 0.0
+    assert batch.dones.item() == 0.0

    # Check that self.pos-1 truncation is set when buffer is full
    # Note: buffer size is 10, here we are erasing past transitions
    fill_buffer(buffer, length=2)
    # We create a tmp truncation to not sample across episodes
    base_idx = 0
    batch = buffer._get_samples(np.array([base_idx]))
-    actual = batch.rewards.numpy().item()
+    actual = batch.rewards.item()
    # Note: compute_expected_nstep assumes base_idx=1
    expected = compute_expected_nstep_reward(gamma=0.99, n_steps=n_steps, stop_idx=buffer.pos - 1)
    np.testing.assert_allclose(actual, expected, rtol=1e-4)
-    assert batch.dones.numpy().item() == 0.0
+    assert batch.dones.item() == 0.0
+    # Discount factor for bootstrapping with target Q-Value
+    # (bigger than gamma ** n_steps because of truncation at n_steps=2)
+    np.testing.assert_allclose(batch.discounts.item(), gamma**2)

    # Set done=1 manually, the tmp truncation should not be set (it would set batch.done=False)
    buffer.dones[buffer.pos - 1, :] = True
    batch = buffer._get_samples(np.array([base_idx]))
-    actual = batch.rewards.numpy().item()
+    actual = batch.rewards.item()
    # Note: compute_expected_nstep assumes base_idx=0
    expected = compute_expected_nstep_reward(gamma=0.99, n_steps=n_steps, stop_idx=buffer.pos - 1)
    np.testing.assert_allclose(actual, expected, rtol=1e-4)
-    assert batch.dones.numpy().item() == 1.0
+    assert batch.dones.item() == 1.0


def test_match_normal_buffer():

@@ -168,12 +171,12 @@ def test_match_normal_buffer():

    base_idx = 3
    batch1 = buffer._get_samples(np.array([base_idx]))
-    actual1 = batch1.rewards.numpy().item()
+    actual1 = batch1.rewards.item()

    batch2 = ref_buffer._get_samples(np.array([base_idx]))

    expected = compute_expected_nstep_reward(gamma=0.99, n_steps=1)
    np.testing.assert_allclose(actual1, expected, rtol=1e-4)
-    assert batch1.dones.numpy().item() == 0.0
+    assert batch1.dones.item() == 0.0

    np.testing.assert_allclose(batch1.rewards.numpy(), batch2.rewards.numpy(), rtol=1e-4)
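
For reference, the quantity these assertions check is a discounted sum of rewards truncated at the first termination. A hypothetical stand-in for the test helper (the real compute_expected_nstep_reward lives in the test module and depends on the rewards that fill_buffer writes) could look like:

import numpy as np

def nstep_return(rewards: np.ndarray, gamma: float, stop_idx: int) -> float:
    # Discounted sum of rewards up to and including stop_idx
    k = np.arange(stop_idx + 1)
    return float(np.sum(gamma**k * rewards[: stop_idx + 1]))

# With constant rewards of 1.0, gamma=0.99 and a stop after two steps:
print(nstep_return(np.ones(5), gamma=0.99, stop_idx=1))  # 1.99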
