Cast at sample time only

araffin · araffin · commit 4b0cfc36f881 · 2025-08-04T18:39:02.000+02:00
diff --git a/stable_baselines3/common/buffers.py b/stable_baselines3/common/buffers.py
@@ -390,9 +390,7 @@ def __init__(
 
     def reset(self) -> None:
         self.observations = np.zeros((self.buffer_size, self.n_envs, *self.obs_shape), dtype=self.observation_space.dtype)
-        self.actions = np.zeros(
-            (self.buffer_size, self.n_envs, self.action_dim), dtype=self._maybe_cast_dtype(self.action_space.dtype)
-        )
+        self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=self.action_space.dtype)
         self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
         self.returns = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
         self.episode_starts = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
@@ -514,29 +512,15 @@ def _get_samples(
     ) -> RolloutBufferSamples:
         data = (
             self.observations[batch_inds],
-            self.actions[batch_inds],
+            # Cast to float32 (backward compatible), otherwise this would lead to RuntimeError for MultiBinary space
+            self.actions[batch_inds].astype(np.float32, copy=False),
             self.values[batch_inds].flatten(),
             self.log_probs[batch_inds].flatten(),
             self.advantages[batch_inds].flatten(),
             self.returns[batch_inds].flatten(),
         )
         return RolloutBufferSamples(*tuple(map(self.to_torch, data)))
 
-    @staticmethod
-    def _maybe_cast_dtype(dtype: np.typing.DTypeLike) -> np.typing.DTypeLike:
-        """
-        Cast `np.int8` action datatype to `np.float32`, keep the others dtype unchanged.
-        Otherwise, this would lead to
-        "RuntimeError: result type Float can't be cast to the desired output type Char"
-        when trying to compute the log prob for MultiBinary space.
-
-        :param dtype: The original action space dtype
-        :return: ``np.float32`` if the dtype was int8, the original dtype otherwise.
-        """
-        if dtype == np.int8:
-            return np.float32
-        return dtype
-
 
 class DictReplayBuffer(ReplayBuffer):
     """
@@ -765,9 +749,7 @@ def reset(self) -> None:
             self.observations[key] = np.zeros(
                 (self.buffer_size, self.n_envs, *obs_input_shape), dtype=self.observation_space[key].dtype
             )
-        self.actions = np.zeros(
-            (self.buffer_size, self.n_envs, self.action_dim), dtype=self._maybe_cast_dtype(self.action_space.dtype)
-        )
+        self.actions = np.zeros((self.buffer_size, self.n_envs, self.action_dim), dtype=self.action_space.dtype)
         self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
         self.returns = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
         self.episode_starts = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
@@ -853,7 +835,8 @@ def _get_samples(  # type: ignore[override]
     ) -> DictRolloutBufferSamples:
         return DictRolloutBufferSamples(
             observations={key: self.to_torch(obs[batch_inds]) for (key, obs) in self.observations.items()},
-            actions=self.to_torch(self.actions[batch_inds]),
+            # Cast to float32 (backward compatible), otherwise this would lead to RuntimeError for MultiBinary space
+            actions=self.to_torch(self.actions[batch_inds].astype(np.float32, copy=False)),
             old_values=self.to_torch(self.values[batch_inds].flatten()),
             old_log_prob=self.to_torch(self.log_probs[batch_inds].flatten()),
             advantages=self.to_torch(self.advantages[batch_inds].flatten()),
diff --git a/tests/test_buffers.py b/tests/test_buffers.py
@@ -193,8 +193,9 @@ def test_buffer_dtypes(obs_dtype, use_dict, action_space):
     buffer_params = dict(buffer_size=1, action_space=action_space)
     # For off-policy algorithms, we cast float64 actions to float32, see GH#1145
     actual_replay_action_dtype = ReplayBuffer._maybe_cast_dtype(action_space.dtype)
-    # For on-policy, we cast int8 to int64 to avoid issue computing log prob
-    actual_rollout_action_dtype = RolloutBuffer._maybe_cast_dtype(action_space.dtype)
+    # For on-policy, we cast at sample time to float32 for backward compat
+    # and to avoid an issue when computing the log prob for MultiBinary spaces
+    actual_rollout_action_dtype = np.float32
 
     if use_dict:
         dict_obs_space = spaces.Dict({"obs": obs_space, "obs_2": spaces.Box(0, 100, dtype=np.uint8)})
@@ -212,7 +213,7 @@ def test_buffer_dtypes(obs_dtype, use_dict, action_space):
         assert rollout_buffer.observations.dtype == obs_dtype
         assert replay_buffer.observations.dtype == obs_dtype
 
-    assert rollout_buffer.actions.dtype == actual_rollout_action_dtype
+    assert rollout_buffer.actions.dtype == action_space.dtype
     assert replay_buffer.actions.dtype == actual_replay_action_dtype
     # Check that sampled types are corrects
     rollout_buffer.full = True