Fix QRDQN loading target_update_interval (#259)

jak3122 · araffin · web-flow · commit 3d9a975af220 · 2024-10-02T10:22:57.000+02:00
* Fix QRDQN loading target_update_interval

* Update changelog

* Update version

---------

Co-authored-by: Antonin RAFFIN &lt;antonin.raffin@ensta.org&gt;
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -152,13 +152,13 @@ To run tests with `pytest`:
 make pytest
 ```
 
-Type checking with `pytype` and `mypy`:
+Type checking with `mypy`:
 
 ```
 make type
 ```
 
-Codestyle check with `black`, `isort` and `flake8`:
+Codestyle check with `black` and `ruff`:
 
 ```
 make check-codestyle
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
@@ -4,7 +4,7 @@ Changelog
 ==========
 
 
-Release 2.4.0a8 (WIP)
+Release 2.4.0a9 (WIP)
 --------------------------
 
 Breaking Changes:
@@ -19,6 +19,7 @@ Bug Fixes:
 - Updated QR-DQN optimizer input to only include quantile_net parameters (@corentinlger)
 - Updated QR-DQN paper link in docs (@corentinlger)
 - Fixed a warning with PyTorch 2.4 when loading a `RecurrentPPO` model (You are using torch.load with weights_only=False)
+- Fixed loading QRDQN changes `target_update_interval` (@jak3122)
 
 Deprecations:
 ^^^^^^^^^^^^^
diff --git a/sb3_contrib/qrdqn/qrdqn.py b/sb3_contrib/qrdqn/qrdqn.py
@@ -153,8 +153,7 @@ def _setup_model(self) -> None:
         self.exploration_schedule = get_linear_fn(
             self.exploration_initial_eps, self.exploration_final_eps, self.exploration_fraction
         )
-        # Account for multiple environments
-        # each call to step() corresponds to n_envs transitions
+
         if self.n_envs > 1:
             if self.n_envs > self.target_update_interval:
                 warnings.warn(
@@ -164,8 +163,6 @@ def _setup_model(self) -> None:
                     f"which corresponds to {self.n_envs} steps."
                 )
 
-            self.target_update_interval = max(self.target_update_interval // self.n_envs, 1)
-
     def _create_aliases(self) -> None:
         self.quantile_net = self.policy.quantile_net
         self.quantile_net_target = self.policy.quantile_net_target
@@ -177,7 +174,9 @@ def _on_step(self) -> None:
         This method is called in ``collect_rollouts()`` after each step in the environment.
         """
         self._n_calls += 1
-        if self._n_calls % self.target_update_interval == 0:
+        # Account for multiple environments
+        # each call to step() corresponds to n_envs transitions
+        if self._n_calls % max(self.target_update_interval // self.n_envs, 1) == 0:
             polyak_update(self.quantile_net.parameters(), self.quantile_net_target.parameters(), self.tau)
             # Copy running stats, see https://github.com/DLR-RM/stable-baselines3/issues/996
             polyak_update(self.batch_norm_stats, self.batch_norm_stats_target, 1.0)
diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt
@@ -1 +1 @@
-2.4.0a8
+2.4.0a9
diff --git a/tests/test_save_load.py b/tests/test_save_load.py
@@ -8,6 +8,7 @@
 import pytest
 import torch as th
 from stable_baselines3.common.base_class import BaseAlgorithm
+from stable_baselines3.common.env_util import make_vec_env
 from stable_baselines3.common.envs import FakeImageEnv, IdentityEnv, IdentityEnvBox
 from stable_baselines3.common.utils import get_device
 from stable_baselines3.common.vec_env import DummyVecEnv
@@ -481,3 +482,14 @@ def test_save_load_pytorch_var(tmp_path):
     assert model.log_ent_coef is None
     # Check that the entropy coefficient is still the same
     assert th.allclose(ent_coef_before, ent_coef_after)
+
+
+def test_dqn_target_update_interval(tmp_path):
+    # `target_update_interval` should not change when reloading the model. See GH Issue #258.
+    env = make_vec_env(env_id="CartPole-v1", n_envs=2)
+    model = QRDQN("MlpPolicy", env, verbose=1, target_update_interval=100)
+    model.save(tmp_path / "dqn_cartpole")
+    del model
+    model = QRDQN.load(tmp_path / "dqn_cartpole")
+    os.remove(tmp_path / "dqn_cartpole.zip")
+    assert model.target_update_interval == 100