From bb00557eea8abaefee3dcdbd4ca2fd718030b6b6 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 11 Apr 2022 16:29:48 +0200 Subject: [PATCH 01/14] Use the MaxTrialsCallback to set the number of trials idnependent from workers --- utils/exp_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 1b6725831..5dffdb1f8 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -12,6 +12,7 @@ import optuna import torch as th import yaml +from optuna.study import MaxTrialsCallback from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler @@ -747,7 +748,7 @@ def hyperparameters_optimization(self) -> None: ) try: - study.optimize(self.objective, n_trials=self.n_trials, n_jobs=self.n_jobs) + study.optimize(self.objective, n_jobs=self.n_jobs, callbacks=[MaxTrialsCallback(self.n_trials)]) except KeyboardInterrupt: pass From edbd3ac8f9d9c6fc71ce5f7150a0b5d476910563 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 29 Mar 2022 16:16:16 +0200 Subject: [PATCH 02/14] Add seals environments and corresponding (tentative) hyperparameters. --- hyperparams/a2c.yml | 33 ++++++++++++ hyperparams/ars.yml | 77 ++++++++++++++++++++++++++++ hyperparams/ddpg.yml | 22 ++++++++ hyperparams/dqn.yml | 30 +++++++++++ hyperparams/ppo.yml | 113 ++++++++++++++++++++++++++++++++++++++++++ hyperparams/qrdqn.yml | 30 +++++++++++ hyperparams/sac.yml | 20 ++++++++ hyperparams/td3.yml | 32 ++++++++++++ hyperparams/tqc.yml | 21 ++++++++ hyperparams/trpo.yml | 44 ++++++++++++++++ requirements.txt | 1 + utils/import_envs.py | 5 ++ 12 files changed, 428 insertions(+) diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml index f51b0fb09..23763cd02 100644 --- a/hyperparams/a2c.yml +++ b/hyperparams/a2c.yml @@ -15,6 +15,12 @@ CartPole-v1: policy: 'MlpPolicy' ent_coef: 0.0 +seals/CartPole-v0: + n_envs: 8 + n_timesteps: !!float 5e5 + policy: 'MlpPolicy' + ent_coef: 0.0 + LunarLander-v2: n_envs: 8 n_timesteps: !!float 2e5 @@ -31,6 +37,13 @@ MountainCar-v0: policy: 'MlpPolicy' ent_coef: .0 +seals/MountainCar-v0: + normalize: true + n_envs: 16 + n_timesteps: !!float 1e6 + policy: 'MlpPolicy' + ent_coef: .0 + Acrobot-v1: normalize: true n_envs: 16 @@ -166,19 +179,39 @@ HalfCheetah-v3: &mujoco-defaults n_timesteps: !!float 1e6 policy: 'MlpPolicy' +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults +seals/Hopper-v0: + <<: *mujoco-defaults + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 + +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 diff --git a/hyperparams/ars.yml b/hyperparams/ars.yml index e58d4fa3c..eb89cc211 100644 --- a/hyperparams/ars.yml +++ b/hyperparams/ars.yml @@ -5,6 +5,12 @@ CartPole-v1: policy: 'LinearPolicy' n_delta: 2 +seals/CartPole-v0: + n_envs: 1 + n_timesteps: !!float 5e4 + policy: 'LinearPolicy' + n_delta: 2 + # Tuned Pendulum-v1: &pendulum-params n_envs: 1 @@ -41,6 +47,11 @@ MountainCar-v0: n_delta: 8 n_timesteps: !!float 5e5 +seals/MountainCar-v0: + <<: *pendulum-params + n_delta: 8 + n_timesteps: !!float 5e5 + # Tuned MountainCarContinuous-v0: <<: *pendulum-params @@ -119,6 
+130,17 @@ Swimmer-v3: alive_bonus_offset: 0 # normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Swimmer-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 2e6 + learning_rate: !!float 0.02 + delta_std: !!float 0.01 + n_delta: 1 + n_top: 1 + alive_bonus_offset: 0 + # normalize: "dict(norm_obs=True, norm_reward=False)" + Hopper-v3: n_envs: 1 policy: 'LinearPolicy' @@ -130,6 +152,17 @@ Hopper-v3: alive_bonus_offset: -1 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Hopper-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 7e6 + learning_rate: !!float 0.01 + delta_std: !!float 0.025 + n_delta: 8 + n_top: 4 + alive_bonus_offset: -1 + normalize: "dict(norm_obs=True, norm_reward=False)" + HalfCheetah-v3: n_envs: 1 policy: 'LinearPolicy' @@ -141,6 +174,17 @@ HalfCheetah-v3: alive_bonus_offset: 0 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/HalfCheetah-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 1.25e7 + learning_rate: !!float 0.02 + delta_std: !!float 0.03 + n_delta: 32 + n_top: 4 + alive_bonus_offset: 0 + normalize: "dict(norm_obs=True, norm_reward=False)" + Walker2d-v3: n_envs: 1 policy: 'LinearPolicy' @@ -152,6 +196,17 @@ Walker2d-v3: alive_bonus_offset: -1 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Walker2d-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 7.5e7 + learning_rate: !!float 0.03 + delta_std: !!float 0.025 + n_delta: 40 + n_top: 30 + alive_bonus_offset: -1 + normalize: "dict(norm_obs=True, norm_reward=False)" + Ant-v3: n_envs: 1 policy: 'LinearPolicy' @@ -163,6 +218,17 @@ Ant-v3: alive_bonus_offset: -1 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Ant-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 7.5e7 + learning_rate: !!float 0.015 + delta_std: !!float 0.025 + n_delta: 60 + n_top: 20 + alive_bonus_offset: -1 + normalize: "dict(norm_obs=True, norm_reward=False)" + Humanoid-v3: n_envs: 1 @@ -175,6 +241,17 @@ Humanoid-v3: alive_bonus_offset: -5 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Humanoid-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 2.5e8 + learning_rate: 0.02 + delta_std: 0.0075 + n_delta: 256 + n_top: 256 + alive_bonus_offset: -5 + normalize: "dict(norm_obs=True, norm_reward=False)" + # Almost tuned BipedalWalker-v3: n_envs: 1 diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml index 14a53cfca..996cee226 100644 --- a/hyperparams/ddpg.yml +++ b/hyperparams/ddpg.yml @@ -131,21 +131,43 @@ HalfCheetah-v3: &mujoco-defaults noise_type: 'normal' noise_std: 0.1 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults +seals/Hopper-v0: + <<: *mujoco-defaults + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 train_freq: 1 gradient_steps: 1 + +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + train_freq: 1 + gradient_steps: 1 diff --git a/hyperparams/dqn.yml b/hyperparams/dqn.yml index 301adf654..6fb77ed6c 100644 --- a/hyperparams/dqn.yml +++ b/hyperparams/dqn.yml @@ -31,6 +31,21 @@ CartPole-v1: exploration_final_eps: 0.04 policy_kwargs: "dict(net_arch=[256, 256])" +seals/CartPole-v0: + n_timesteps: !!float 5e4 + policy: 'MlpPolicy' + learning_rate: !!float 2.3e-3 + batch_size: 64 + buffer_size: 100000 
+ learning_starts: 1000 + gamma: 0.99 + target_update_interval: 10 + train_freq: 256 + gradient_steps: 128 + exploration_fraction: 0.16 + exploration_final_eps: 0.04 + policy_kwargs: "dict(net_arch=[256, 256])" + # Tuned MountainCar-v0: n_timesteps: !!float 1.2e5 @@ -47,6 +62,21 @@ MountainCar-v0: exploration_final_eps: 0.07 policy_kwargs: "dict(net_arch=[256, 256])" +seals/MountainCar-v0: + n_timesteps: !!float 1.2e5 + policy: 'MlpPolicy' + learning_rate: !!float 4e-3 + batch_size: 128 + buffer_size: 10000 + learning_starts: 1000 + gamma: 0.98 + target_update_interval: 600 + train_freq: 16 + gradient_steps: 8 + exploration_fraction: 0.2 + exploration_final_eps: 0.07 + policy_kwargs: "dict(net_arch=[256, 256])" + # Tuned LunarLander-v2: n_timesteps: !!float 1e5 diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index 2a88c30d8..07e4e51f7 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -42,6 +42,19 @@ CartPole-v1: learning_rate: lin_0.001 clip_range: lin_0.2 +seals/CartPole-v0: + n_envs: 8 + n_timesteps: !!float 1e5 + policy: 'MlpPolicy' + n_steps: 32 + batch_size: 256 + gae_lambda: 0.8 + gamma: 0.98 + n_epochs: 20 + ent_coef: 0.0 + learning_rate: lin_0.001 + clip_range: lin_0.2 + MountainCar-v0: normalize: true n_envs: 16 @@ -53,6 +66,17 @@ MountainCar-v0: n_epochs: 4 ent_coef: 0.0 +seals/MountainCar-v0: + normalize: true + n_envs: 16 + n_timesteps: !!float 1e6 + policy: 'MlpPolicy' + n_steps: 16 + gae_lambda: 0.98 + gamma: 0.99 + n_epochs: 4 + ent_coef: 0.0 + # Tuned MountainCarContinuous-v0: normalize: true @@ -355,6 +379,9 @@ Ant-v3: &mujoco-defaults n_timesteps: !!float 1e6 policy: 'MlpPolicy' +seals/Ant-v0: + <<: *mujoco-defaults + # Hopper-v3: # <<: *mujoco-defaults # @@ -369,6 +396,10 @@ Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # Tuned # 10 mujoco envs @@ -394,6 +425,28 @@ HalfCheetah-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +seals/HalfCheetah-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e6 + batch_size: 64 + n_steps: 512 + gamma: 0.98 + learning_rate: 2.0633e-05 + ent_coef: 0.000401762 + clip_range: 0.1 + n_epochs: 20 + gae_lambda: 0.92 + max_grad_norm: 0.8 + vf_coef: 0.58096 + policy_kwargs: "dict( + log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" + # Ant-v3: # normalize: true # n_envs: 1 @@ -432,6 +485,28 @@ Hopper-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +seals/Hopper-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e6 + batch_size: 32 + n_steps: 512 + gamma: 0.999 + learning_rate: 9.80828e-05 + ent_coef: 0.00229519 + clip_range: 0.2 + n_epochs: 5 + gae_lambda: 0.99 + max_grad_norm: 0.7 + vf_coef: 0.835671 + policy_kwargs: "dict( + log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" + HumanoidStandup-v3: normalize: true n_envs: 1 @@ -476,6 +551,28 @@ Humanoid-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +seals/Humanoid-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e7 + batch_size: 256 + n_steps: 512 + gamma: 0.95 + learning_rate: 3.56987e-05 + ent_coef: 0.00238306 + clip_range: 0.3 + n_epochs: 5 + gae_lambda: 0.9 + max_grad_norm: 2 + vf_coef: 0.431892 + policy_kwargs: "dict( + log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" + InvertedDoublePendulum-v3: normalize: true 
n_envs: 1 @@ -561,3 +658,19 @@ Walker2d-v3: gae_lambda: 0.95 max_grad_norm: 1 vf_coef: 0.871923 + +seals/Walker2d-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e6 + batch_size: 32 + n_steps: 512 + gamma: 0.99 + learning_rate: 5.05041e-05 + ent_coef: 0.000585045 + clip_range: 0.1 + n_epochs: 20 + gae_lambda: 0.95 + max_grad_norm: 1 + vf_coef: 0.871923 diff --git a/hyperparams/qrdqn.yml b/hyperparams/qrdqn.yml index 6a6f13bd0..aca12afba 100644 --- a/hyperparams/qrdqn.yml +++ b/hyperparams/qrdqn.yml @@ -23,6 +23,21 @@ CartPole-v1: exploration_final_eps: 0.04 policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=10)" +seals/CartPole-v0: + n_timesteps: !!float 5e4 + policy: 'MlpPolicy' + learning_rate: !!float 2.3e-3 + batch_size: 64 + buffer_size: 100000 + learning_starts: 1000 + gamma: 0.99 + target_update_interval: 10 + train_freq: 256 + gradient_steps: 128 + exploration_fraction: 0.16 + exploration_final_eps: 0.04 + policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=10)" + # Tuned MountainCar-v0: n_timesteps: !!float 1.2e5 @@ -39,6 +54,21 @@ MountainCar-v0: exploration_final_eps: 0.07 policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=25)" +seals/MountainCar-v0: + n_timesteps: !!float 1.2e5 + policy: 'MlpPolicy' + learning_rate: !!float 4e-3 + batch_size: 128 + buffer_size: 10000 + learning_starts: 1000 + gamma: 0.98 + target_update_interval: 600 + train_freq: 16 + gradient_steps: 8 + exploration_fraction: 0.2 + exploration_final_eps: 0.07 + policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=25)" + # Tuned LunarLander-v2: n_timesteps: !!float 1e5 diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index 8aeaef616..c0af74cd8 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -195,23 +195,43 @@ HalfCheetah-v3: &mujoco-defaults policy: 'MlpPolicy' learning_starts: 10000 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults +seals/Hopper-v0: + <<: *mujoco-defaults + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # === HER Robotics GoalEnvs === FetchReach-v1: diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml index 9b941516b..c29d49199 100644 --- a/hyperparams/td3.yml +++ b/hyperparams/td3.yml @@ -133,9 +133,15 @@ HalfCheetah-v3: &mujoco-defaults noise_type: 'normal' noise_std: 0.1 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults # SAC Hyperparams @@ -144,9 +150,20 @@ Hopper-v3: learning_rate: !!float 3e-4 batch_size: 256 +seals/Hopper-v0: + <<: *mujoco-defaults + # SAC Hyperparams + train_freq: 1 + gradient_steps: 1 + learning_rate: !!float 3e-4 + batch_size: 256 + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 @@ -156,9 +173,24 @@ Humanoid-v3: learning_rate: !!float 3e-4 batch_size: 256 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + # SAC Hyperparams + train_freq: 1 + gradient_steps: 1 + learning_rate: !!float 3e-4 + batch_size: 256 + # Tuned Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 train_freq: 1 gradient_steps: 1 + +seals/Swimmer-v0: + <<: *mujoco-defaults 
+ gamma: 0.9999 + train_freq: 1 + gradient_steps: 1 diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 54e2f3a4d..23587c2e6 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -145,24 +145,45 @@ HalfCheetah-v3: &mujoco-defaults policy: 'MlpPolicy' learning_starts: 10000 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults top_quantiles_to_drop_per_net: 5 +seals/Hopper-v0: + <<: *mujoco-defaults + top_quantiles_to_drop_per_net: 5 + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # === HER Robotics GoalEnvs === FetchReach-v1: env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper diff --git a/hyperparams/trpo.yml b/hyperparams/trpo.yml index c78263338..01fbfccba 100644 --- a/hyperparams/trpo.yml +++ b/hyperparams/trpo.yml @@ -11,6 +11,18 @@ CartPole-v1: learning_rate: !!float 1e-3 n_critic_updates: 20 +seals/CartPole-v0: + n_envs: 2 + n_timesteps: !!float 1e5 + policy: 'MlpPolicy' + n_steps: 512 + batch_size: 512 + cg_damping: !!float 1e-3 + gae_lambda: 0.98 + gamma: 0.99 + learning_rate: !!float 1e-3 + n_critic_updates: 20 + # Tuned Pendulum-v1: n_envs: 2 @@ -60,6 +72,14 @@ MountainCar-v0: n_steps: 1024 n_critic_updates: 20 +seals/MountainCar-v0: + normalize: true + n_envs: 2 + n_timesteps: !!float 1e5 + policy: 'MlpPolicy' + n_steps: 1024 + n_critic_updates: 20 + # Tuned MountainCarContinuous-v0: normalize: True @@ -122,25 +142,49 @@ Ant-v3: &mujoco-defaults <<: *pybullet-defaults n_timesteps: !!float 1e6 +seals/Ant-v0: + <<: *mujoco-defaults + # Tuned HalfCheetah-v3: <<: *mujoco-defaults target_kl: 0.04 + +seals/HalfCheetah-v0: + <<: *mujoco-defaults + target_kl: 0.04 + # Tuned Hopper-v3: <<: *mujoco-defaults + +seals/Hopper-v0: + <<: *mujoco-defaults + # Tuned Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 + +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + # Tuned Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # Tuned BipedalWalker-v3: <<: *mujoco-defaults diff --git a/requirements.txt b/requirements.txt index dbd890eb1..e28ddd32e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ plotly panda-gym==1.1.1 # tmp fix: until compatibility with panda-gym v2 rliable>=1.0.5 wandb +seals diff --git a/utils/import_envs.py b/utils/import_envs.py index fbe0370e3..8b74024ea 100644 --- a/utils/import_envs.py +++ b/utils/import_envs.py @@ -32,3 +32,8 @@ import panda_gym # pytype: disable=import-error except ImportError: panda_gym = None + +try: + import seals +except ImportError: + seals = None From fc8244468371faaa40f090aa136732b7615c6e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Sun, 17 Apr 2022 20:37:20 +0200 Subject: [PATCH 03/14] Use `NopPruner` when pruner is set to `"none"` (#234) * Replace diverted MedianPruner by NopPruner * Update changelog Co-authored-by: Antonin RAFFIN --- CHANGELOG.md | 1 + utils/exp_manager.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25c7dd5ab..3108d0185 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Documentation ### Other +- When pruner is set to `"none"`, use `NopPruner` instead of diverted `MedianPruner` (@qgallouedec) ## Release 1.5.0 (2022-03-25) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 5dffdb1f8..66a34c2df 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -14,7 +14,7 @@ import yaml from optuna.study import MaxTrialsCallback from optuna.integration.skopt import SkoptSampler -from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner +from optuna.pruners import BasePruner, MedianPruner, NopPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler from optuna.visualization import plot_optimization_history, plot_param_importances from sb3_contrib.common.vec_env import AsyncEval @@ -620,7 +620,7 @@ def _create_pruner(self, pruner_method: str) -> BasePruner: pruner = MedianPruner(n_startup_trials=self.n_startup_trials, n_warmup_steps=self.n_evaluations // 3) elif pruner_method == "none": # Do not prune - pruner = MedianPruner(n_startup_trials=self.n_trials, n_warmup_steps=self.n_evaluations) + pruner = NopPruner() else: raise ValueError(f"Unknown pruner: {pruner_method}") return pruner From 217cf1fee6e370252f107684c7f3c9e714363a21 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 21 Apr 2022 17:34:52 +0200 Subject: [PATCH 04/14] Dox fixes and move to python 3.7+ style (#237) * Dox fixes and move to python 3.7+ style * Fix type * Fix broken packages --- .github/workflows/ci.yml | 4 +-- .github/workflows/trained_agents.yml | 4 +-- CHANGELOG.md | 5 ++- README.md | 2 +- docker/Dockerfile | 2 +- enjoy.py | 2 +- hyperparams/ppo.yml | 2 +- requirements.txt | 1 + scripts/plot_from_file.py | 2 +- utils/callbacks.py | 10 +++--- utils/exp_manager.py | 6 ++-- utils/record_video.py | 2 +- utils/utils.py | 4 +-- utils/wrappers.py | 48 ++++++++++++++-------------- 14 files changed, 49 insertions(+), 45 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 24390204e..45d873255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,13 +30,13 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html pip install pybullet==3.1.9 pip install -r requirements.txt # Use headless version pip install opencv-python-headless # install parking-env to test HER (pinned so it works with gym 0.21) - pip install git+https://github.com/eleurent/highway-env@1a04c6a98be64632cf9683625022023e70ff1ab1 + pip install highway-env==1.5.0 - name: Type check run: | make type diff --git a/.github/workflows/trained_agents.yml b/.github/workflows/trained_agents.yml index 39a970f85..a047e9861 100644 --- a/.github/workflows/trained_agents.yml +++ b/.github/workflows/trained_agents.yml @@ -29,13 +29,13 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html pip install pybullet==3.1.9 pip install -r requirements.txt # Use headless version pip install opencv-python-headless # install parking-env to test HER (pinned so it works with gym 0.21) - pip install 
git+https://github.com/eleurent/highway-env@1a04c6a98be64632cf9683625022023e70ff1ab1 + pip install highway-env==1.5.0 # Add support for pickle5 protocol pip install pickle5 - name: Check trained agents diff --git a/CHANGELOG.md b/CHANGELOG.md index 3108d0185..bc7c863ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,15 @@ ### Breaking Changes - Change default value for number of hyperparameter optimization trials from 10 to 500. (@ernestum) - Derive number of intermediate pruning evaluations from number of time steps (1 evaluation per 100k time steps.) (@ernestum) -- Updated default --eval-freq from 10k to 25k steps +- Updated default --eval-freq from 10k to 25k steps +- Update default horizon to 2 for the `HistoryWrapper` ### New Features - Support setting PyTorch's device with thye `--device` flag (@gregwar) ### Bug fixes +- Fix `Reacher-v3` name in PPO hyperparameter file +- Pinned ale-py==0.7.4 until new SB3 version is released ### Documentation diff --git a/README.md b/README.md index 9b760fa3a..181aaec0f 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ python scripts/plot_train.py -a her -e Fetch -y success -f rl-trained-agents/ -w Plot evaluation reward curve for TQC, SAC and TD3 on the HalfCheetah and Ant PyBullet environments: ``` -python scripts/all_plots.py -a sac td3 tqc --env HalfCheetah Ant -f rl-trained-agents/ +python3 scripts/all_plots.py -a sac td3 tqc --env HalfCheetahBullet AntBullet -f rl-trained-agents/ ``` ## Plot with the rliable library diff --git a/docker/Dockerfile b/docker/Dockerfile index a99e9a7b1..2baa348c1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,7 +21,7 @@ RUN \ mkdir -p ${CODE_DIR}/rl_zoo && \ pip uninstall -y stable-baselines3 && \ pip install -r /tmp/requirements.txt && \ - pip install git+https://github.com/eleurent/highway-env@1a04c6a98be64632cf9683625022023e70ff1ab1 && \ + pip install pip install highway-env==1.5.0 && \ rm -rf $HOME/.cache/pip ENV PATH=$VENV/bin:$PATH diff --git a/enjoy.py b/enjoy.py index c92304fb2..731b451bd 100644 --- a/enjoy.py +++ b/enjoy.py @@ -138,7 +138,7 @@ def step_count(checkpoint_path: str) -> int: env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): - with open(args_path, "r") as f: + with open(args_path) as f: loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index 07e4e51f7..fc2d3e1aa 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -605,7 +605,7 @@ InvertedPendulum-v3: max_grad_norm: 0.3 vf_coef: 0.19816 -Reacher-v3: +Reacher-v2: normalize: true n_envs: 1 policy: 'MlpPolicy' diff --git a/requirements.txt b/requirements.txt index e28ddd32e..6fe1ba43e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,5 @@ plotly panda-gym==1.1.1 # tmp fix: until compatibility with panda-gym v2 rliable>=1.0.5 wandb +ale-py==0.7.4 # tmp fix: until new SB3 version is released seals diff --git a/scripts/plot_from_file.py b/scripts/plot_from_file.py index 4ec0043cf..b77660c86 100644 --- a/scripts/plot_from_file.py +++ b/scripts/plot_from_file.py @@ -193,7 +193,7 @@ def restyle_boxplot(artist_dict, color, gray="#222222", linewidth=1, fliersize=5 warnings.warn(f"{env} not found for normalizing scores, you should update `env_key_to_env_id`") # Truncate to convert to matrix - min_runs = min([len(algo_score) for algo_score in algo_scores]) + min_runs = 
min(len(algo_score) for algo_score in algo_scores) if min_runs > 0: algo_scores = [algo_score[:min_runs] for algo_score in algo_scores] # shape: (n_envs, n_runs) -> (n_runs, n_envs) diff --git a/utils/callbacks.py b/utils/callbacks.py index 608c5a778..5d3931d9f 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -31,7 +31,7 @@ def __init__( log_path: Optional[str] = None, ): - super(TrialEvalCallback, self).__init__( + super().__init__( eval_env=eval_env, n_eval_episodes=n_eval_episodes, eval_freq=eval_freq, @@ -46,7 +46,7 @@ def __init__( def _on_step(self) -> bool: if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0: - super(TrialEvalCallback, self)._on_step() + super()._on_step() self.eval_idx += 1 # report best or report current ? # report num_timesteps or elasped time ? @@ -69,7 +69,7 @@ class SaveVecNormalizeCallback(BaseCallback): """ def __init__(self, save_freq: int, save_path: str, name_prefix: Optional[str] = None, verbose: int = 0): - super(SaveVecNormalizeCallback, self).__init__(verbose) + super().__init__(verbose) self.save_freq = save_freq self.save_path = save_path self.name_prefix = name_prefix @@ -111,7 +111,7 @@ class ParallelTrainCallback(BaseCallback): """ def __init__(self, gradient_steps: int = 100, verbose: int = 0, sleep_time: float = 0.0): - super(ParallelTrainCallback, self).__init__(verbose) + super().__init__(verbose) self.batch_size = 0 self._model_ready = True self._model = None @@ -202,7 +202,7 @@ class RawStatisticsCallback(BaseCallback): """ def __init__(self, verbose=0): - super(RawStatisticsCallback, self).__init__(verbose) + super().__init__(verbose) # Custom counter to reports stats # (and avoid reporting multiple values for the same step) self._timesteps_counter = 0 diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 66a34c2df..6fe7bd870 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -48,7 +48,7 @@ from utils.utils import ALGOS, get_callback_list, get_latest_run_id, get_wrapper_class, linear_schedule -class ExperimentManager(object): +class ExperimentManager: """ Experiment manager: read the hyperparameters, preprocess them, create the environment and the RL model. 
@@ -91,7 +91,7 @@ def __init__( no_optim_plots: bool = False, device: Union[th.device, str] = "auto", ): - super(ExperimentManager, self).__init__() + super().__init__() self.algo = algo self.env_id = env_id # Custom params @@ -261,7 +261,7 @@ def _save_config(self, saved_hyperparams: Dict[str, Any]) -> None: def read_hyperparameters(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: # Load hyperparameters from yaml file - with open(f"hyperparams/{self.algo}.yml", "r") as f: + with open(f"hyperparams/{self.algo}.yml") as f: hyperparams_dict = yaml.safe_load(f) if self.env_id in list(hyperparams_dict.keys()): hyperparams = hyperparams_dict[self.env_id] diff --git a/utils/record_video.py b/utils/record_video.py index dd89c0220..cc9d9ab68 100644 --- a/utils/record_video.py +++ b/utils/record_video.py @@ -86,7 +86,7 @@ env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): - with open(args_path, "r") as f: + with open(args_path) as f: loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] diff --git a/utils/utils.py b/utils/utils.py index 6072cc7cd..e46213dbf 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -318,7 +318,7 @@ def get_saved_hyperparams( config_file = os.path.join(stats_path, "config.yml") if os.path.isfile(config_file): # Load saved hyperparameters - with open(os.path.join(stats_path, "config.yml"), "r") as f: + with open(os.path.join(stats_path, "config.yml")) as f: hyperparams = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr hyperparams["normalize"] = hyperparams.get("normalize", False) else: @@ -347,7 +347,7 @@ class StoreDict(argparse.Action): def __init__(self, option_strings, dest, nargs=None, **kwargs): self._nargs = nargs - super(StoreDict, self).__init__(option_strings, dest, nargs=nargs, **kwargs) + super().__init__(option_strings, dest, nargs=nargs, **kwargs) def __call__(self, parser, namespace, values, option_string=None): arg_dict = {} diff --git a/utils/wrappers.py b/utils/wrappers.py index 9cdaf783f..a62155338 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -11,7 +11,7 @@ class DoneOnSuccessWrapper(gym.Wrapper): """ def __init__(self, env: gym.Env, reward_offset: float = 0.0, n_successes: int = 1): - super(DoneOnSuccessWrapper, self).__init__(env) + super().__init__(env) self.reward_offset = reward_offset self.n_successes = n_successes self.current_successes = 0 @@ -41,12 +41,12 @@ class ActionNoiseWrapper(gym.Wrapper): Add gaussian noise to the action (without telling the agent), to test the robustness of the control. - :param env: (gym.Env) - :param noise_std: (float) Standard deviation of the noise + :param env: + :param noise_std: Standard deviation of the noise """ - def __init__(self, env, noise_std=0.1): - super(ActionNoiseWrapper, self).__init__(env) + def __init__(self, env: gym.Env, noise_std: float = 0.1): + super().__init__(env) self.noise_std = noise_std def step(self, action): @@ -95,13 +95,13 @@ class LowPassFilterWrapper(gym.Wrapper): """ Butterworth-Lowpass - :param env: (gym.Env) + :param env: :param freq: Filter corner frequency. :param df: Sampling rate in Hz. 
""" - def __init__(self, env, freq=5.0, df=25.0): - super(LowPassFilterWrapper, self).__init__(env) + def __init__(self, env: gym.Env, freq: float = 5.0, df: float = 25.0): + super().__init__(env) self.freq = freq self.df = df self.signal = [] @@ -123,12 +123,12 @@ class ActionSmoothingWrapper(gym.Wrapper): """ Smooth the action using exponential moving average. - :param env: (gym.Env) - :param smoothing_coef: (float) Smoothing coefficient (0 no smoothing, 1 very smooth) + :param env: + :param smoothing_coef: Smoothing coefficient (0 no smoothing, 1 very smooth) """ - def __init__(self, env, smoothing_coef: float = 0.0): - super(ActionSmoothingWrapper, self).__init__(env) + def __init__(self, env: gym.Env, smoothing_coef: float = 0.0): + super().__init__(env) self.smoothing_coef = smoothing_coef self.smoothed_action = None # from https://github.com/rail-berkeley/softlearning/issues/3 @@ -152,12 +152,12 @@ class DelayedRewardWrapper(gym.Wrapper): Delay the reward by `delay` steps, it makes the task harder but more realistic. The reward is accumulated during those steps. - :param env: (gym.Env) - :param delay: (int) Number of steps the reward should be delayed. + :param env: + :param delay: Number of steps the reward should be delayed. """ - def __init__(self, env, delay=10): - super(DelayedRewardWrapper, self).__init__(env) + def __init__(self, env: gym.Env, delay: int = 10): + super().__init__(env) self.delay = delay self.current_step = 0 self.accumulated_reward = 0.0 @@ -185,11 +185,11 @@ class HistoryWrapper(gym.Wrapper): """ Stack past observations and actions to give an history to the agent. - :param env: (gym.Env) - :param horizon: (int) Number of steps to keep in the history. + :param env: + :param horizon:Number of steps to keep in the history. """ - def __init__(self, env: gym.Env, horizon: int = 5): + def __init__(self, env: gym.Env, horizon: int = 2): assert isinstance(env.observation_space, gym.spaces.Box) wrapped_obs_space = env.observation_space @@ -208,7 +208,7 @@ def __init__(self, env: gym.Env, horizon: int = 5): # Overwrite the observation space env.observation_space = gym.spaces.Box(low=low, high=high, dtype=wrapped_obs_space.dtype) - super(HistoryWrapper, self).__init__(env) + super().__init__(env) self.horizon = horizon self.low_action, self.high_action = low_action, high_action @@ -244,11 +244,11 @@ class HistoryWrapperObsDict(gym.Wrapper): """ History Wrapper for dict observation. - :param env: (gym.Env) - :param horizon: (int) Number of steps to keep in the history. + :param env: + :param horizon: Number of steps to keep in the history. 
""" - def __init__(self, env, horizon=5): + def __init__(self, env: gym.Env, horizon: int = 2): assert isinstance(env.observation_space.spaces["observation"], gym.spaces.Box) wrapped_obs_space = env.observation_space.spaces["observation"] @@ -267,7 +267,7 @@ def __init__(self, env, horizon=5): # Overwrite the observation space env.observation_space.spaces["observation"] = gym.spaces.Box(low=low, high=high, dtype=wrapped_obs_space.dtype) - super(HistoryWrapperObsDict, self).__init__(env) + super().__init__(env) self.horizon = horizon self.low_action, self.high_action = low_action, high_action From bcb42996ceb39a1001260fda50340d5237afffa6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 21 Apr 2022 17:42:07 +0200 Subject: [PATCH 05/14] Fix division by zero with n-evaluations (closes #238) --- utils/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 6fe7bd870..1d756bd24 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -345,7 +345,7 @@ def _preprocess_hyperparams( # Derive n_evaluations from number of timesteps if needed if self.n_evaluations is None and self.optimize_hyperparameters: - self.n_evaluations = self.n_timesteps // int(1e5) + self.n_evaluations = max(1, self.n_timesteps // int(1e5)) print( f"Doing {self.n_evaluations} intermediate evaluations for pruning based on the number of timesteps." " (1 evaluation every 100k timesteps)" From 763acebde7e5227db0a3a505e6eb7f616a6d8a67 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Apr 2022 12:36:28 +0200 Subject: [PATCH 06/14] Add command line option for total number of trials. --- train.py | 8 ++++++++ utils/exp_manager.py | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index bbf85795e..3bf57b3d7 100644 --- a/train.py +++ b/train.py @@ -64,6 +64,13 @@ type=int, default=500, ) + parser.add_argument( + "--total-n-trials", + help="Number of trials for optimizing hyperparameters. 
" + "This applies to the entire optimization process and takes precedence over --n-trials if set.", + type=int, + default=None, + ) parser.add_argument( "-optimize", "--optimize-hyperparameters", action="store_true", default=False, help="Run hyperparameters search" ) @@ -201,6 +208,7 @@ args.storage, args.study_name, args.n_trials, + args.total_n_trials, args.n_jobs, args.sampler, args.pruner, diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 1d756bd24..b00b22daa 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -16,6 +16,7 @@ from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, NopPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler +from optuna.trial import TrialState from optuna.visualization import plot_optimization_history, plot_param_importances from sb3_contrib.common.vec_env import AsyncEval @@ -74,6 +75,7 @@ def __init__( storage: Optional[str] = None, study_name: Optional[str] = None, n_trials: int = 1, + total_n_trials: Optional[int] = None, n_jobs: int = 1, sampler: str = "tpe", pruner: str = "median", @@ -134,6 +136,7 @@ def __init__( self.no_optim_plots = no_optim_plots # maximum number of trials for finding the best hyperparams self.n_trials = n_trials + self.total_n_trials = total_n_trials # number of parallel jobs when doing hyperparameter search self.n_jobs = n_jobs self.sampler = sampler @@ -748,7 +751,17 @@ def hyperparameters_optimization(self) -> None: ) try: - study.optimize(self.objective, n_jobs=self.n_jobs, callbacks=[MaxTrialsCallback(self.n_trials)]) + if self.total_n_trials is not None: + study.optimize(self.objective, + n_jobs=self.n_jobs, + callbacks= + [MaxTrialsCallback( + self.total_n_trials, + states=[TrialState.COMPLETE, TrialState.RUNNING])]) + else: + study.optimize(self.objective, + n_jobs=self.n_jobs, + n_trials=self.n_trials) except KeyboardInterrupt: pass From 975c520993181476600cc8f2a706278e1de0d97e Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Apr 2022 12:46:04 +0200 Subject: [PATCH 07/14] Fix formatting. --- train.py | 2 +- utils/exp_manager.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/train.py b/train.py index 3bf57b3d7..d55eac042 100644 --- a/train.py +++ b/train.py @@ -67,7 +67,7 @@ parser.add_argument( "--total-n-trials", help="Number of trials for optimizing hyperparameters. 
" - "This applies to the entire optimization process and takes precedence over --n-trials if set.", + "This applies to the entire optimization process and takes precedence over --n-trials if set.", type=int, default=None, ) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index b00b22daa..4dbeba261 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -12,10 +12,10 @@ import optuna import torch as th import yaml -from optuna.study import MaxTrialsCallback from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, NopPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler +from optuna.study import MaxTrialsCallback from optuna.trial import TrialState from optuna.visualization import plot_optimization_history, plot_param_importances from sb3_contrib.common.vec_env import AsyncEval @@ -752,16 +752,13 @@ def hyperparameters_optimization(self) -> None: try: if self.total_n_trials is not None: - study.optimize(self.objective, - n_jobs=self.n_jobs, - callbacks= - [MaxTrialsCallback( - self.total_n_trials, - states=[TrialState.COMPLETE, TrialState.RUNNING])]) + study.optimize( + self.objective, + n_jobs=self.n_jobs, + callbacks=[MaxTrialsCallback(self.total_n_trials, states=[TrialState.COMPLETE, TrialState.RUNNING])], + ) else: - study.optimize(self.objective, - n_jobs=self.n_jobs, - n_trials=self.n_trials) + study.optimize(self.objective, n_jobs=self.n_jobs, n_trials=self.n_trials) except KeyboardInterrupt: pass From 0cafc7192dbdfac5e70372ac17119f124dcfd0cd Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:11:14 +0200 Subject: [PATCH 08/14] Add test for training with multiple workers and the new --total-n-trials argument. --- tests/test_train.py | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_train.py b/tests/test_train.py index ae0f5eda2..fabd88649 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,7 +1,9 @@ import os import subprocess +import optuna import pytest +from optuna.trial import TrialState def _assert_eq(left, right): @@ -101,3 +103,47 @@ def test_parallel_train(tmp_path): return_code = subprocess.call(["python", "train.py"] + args) _assert_eq(return_code, 0) + + +def test_multiple_workers(tmp_path): + study_name = "test-study" + storage = f"sqlite:///{tmp_path}/optuna.db" + n_trials = 6 + args = [ + "-optimize", + "--no-optim-plots", + "--storage", + storage, + "--total-n-trials", + str(n_trials), + "--study-name", + study_name, + "--n-evaluations", + str(1), + "-n", + str(100), + "--algo", + "ppo", + "--env", + "Pendulum-v1", + "--log-folder", + tmp_path, + ] + + p1 = subprocess.Popen(["python", "train.py"] + args) + p2 = subprocess.Popen(["python", "train.py"] + args) + p3 = subprocess.Popen(["python", "train.py"] + args) + p4 = subprocess.Popen(["python", "train.py"] + args) + + return_code1 = p1.wait() + return_code2 = p2.wait() + return_code3 = p3.wait() + return_code4 = p4.wait() + + study = optuna.load_study(study_name=study_name, storage=storage) + assert sum(t.state == TrialState.COMPLETE for t in study.trials) == n_trials + + assert return_code1 == 0 + assert return_code2 == 0 + assert return_code3 == 0 + assert return_code4 == 0 From 6110f3c31ef2b19e8cdab26ed40a48aec536a2da Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:11:38 +0200 Subject: [PATCH 09/14] Ensure Pruned trials are counted and that no optimization is started when enough trials 
are already present. --- utils/exp_manager.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 4dbeba261..0002a9d56 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -752,11 +752,21 @@ def hyperparameters_optimization(self) -> None: try: if self.total_n_trials is not None: - study.optimize( - self.objective, - n_jobs=self.n_jobs, - callbacks=[MaxTrialsCallback(self.total_n_trials, states=[TrialState.COMPLETE, TrialState.RUNNING])], - ) + counted_states = [ + TrialState.COMPLETE, + TrialState.RUNNING, + TrialState.PRUNED, + ] + completed_trials = sum(t.state in counted_states for t in study.trials) + if completed_trials < self.total_n_trials: + study.optimize( + self.objective, + n_jobs=self.n_jobs, + callbacks=[MaxTrialsCallback( + self.total_n_trials, + states=counted_states, + )], + ) else: study.optimize(self.objective, n_jobs=self.n_jobs, n_trials=self.n_trials) except KeyboardInterrupt: From 069556e36ef4f6235a96bdf26c056d6cd67b154e Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:19:50 +0200 Subject: [PATCH 10/14] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc7c863ea..71ea3442d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### New Features - Support setting PyTorch's device with thye `--device` flag (@gregwar) +- Add `--tital-n-trials` parameter to help with distributed optimization. (@ernestum) ### Bug fixes - Fix `Reacher-v3` name in PPO hyperparameter file From 7a195bc78f56aa57d3697681bd9f407a010be066 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:28:35 +0200 Subject: [PATCH 11/14] Ensure that pruned trials are understood as completed trials. --- tests/test_train.py | 2 +- train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_train.py b/tests/test_train.py index fabd88649..d2b87d4cc 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -141,7 +141,7 @@ def test_multiple_workers(tmp_path): return_code4 = p4.wait() study = optuna.load_study(study_name=study_name, storage=storage) - assert sum(t.state == TrialState.COMPLETE for t in study.trials) == n_trials + assert sum(t.state in (TrialState.COMPLETE, TrialState.PRUNED) for t in study.trials) == n_trials assert return_code1 == 0 assert return_code2 == 0 diff --git a/train.py b/train.py index d55eac042..191d01004 100644 --- a/train.py +++ b/train.py @@ -66,7 +66,7 @@ ) parser.add_argument( "--total-n-trials", - help="Number of trials for optimizing hyperparameters. " + help="Number of (potentially pruned) trials for optimizing hyperparameters. " "This applies to the entire optimization process and takes precedence over --n-trials if set.", type=int, default=None, From 645ea48db433cc657ada1d874fe8afe050b155f0 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:28:44 +0200 Subject: [PATCH 12/14] Fix formatting. 
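For reference, a minimal sketch of the behaviour of the `MaxTrialsCallback` call being reformatted here, using a toy objective and a hypothetical `demo` study rather than the zoo's `ExperimentManager` (study name, storage URL and trial budget are made up for illustration): each worker process runs the same script against the shared storage, and the callback stops every worker once the study as a whole holds the requested number of completed, running or pruned trials.

    import optuna
    from optuna.study import MaxTrialsCallback
    from optuna.trial import TrialState

    def objective(trial):
        # Toy quadratic objective standing in for a real training run.
        x = trial.suggest_float("x", -10, 10)
        return -((x - 2) ** 2)

    # Every worker loads the same study from the shared storage.
    study = optuna.create_study(
        study_name="demo",
        storage="sqlite:///demo.db",
        load_if_exists=True,
        direction="maximize",
    )
    # No n_trials here: the callback stops the loop once the study-wide budget is reached.
    study.optimize(
        objective,
        callbacks=[
            MaxTrialsCallback(10, states=(TrialState.COMPLETE, TrialState.RUNNING, TrialState.PRUNED))
        ],
    )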
--- utils/exp_manager.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 0002a9d56..430fe923e 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -762,10 +762,12 @@ def hyperparameters_optimization(self) -> None: study.optimize( self.objective, n_jobs=self.n_jobs, - callbacks=[MaxTrialsCallback( - self.total_n_trials, - states=counted_states, - )], + callbacks=[ + MaxTrialsCallback( + self.total_n_trials, + states=counted_states, + ) + ], ) else: study.optimize(self.objective, n_jobs=self.n_jobs, n_trials=self.n_trials) From 374b3f9a03bba8e9f8a2787fe2c15bd9432810ab Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 29 Apr 2022 15:34:32 +0200 Subject: [PATCH 13/14] Add tuned hyperparameters for ppo and seals environments. --- hyperparams/ppo.yml | 156 ++++++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 65 deletions(-) diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index fc2d3e1aa..41bb70568 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -42,18 +42,22 @@ CartPole-v1: learning_rate: lin_0.001 clip_range: lin_0.2 +# Tuned seals/CartPole-v0: n_envs: 8 n_timesteps: !!float 1e5 policy: 'MlpPolicy' - n_steps: 32 batch_size: 256 - gae_lambda: 0.8 - gamma: 0.98 - n_epochs: 20 - ent_coef: 0.0 - learning_rate: lin_0.001 - clip_range: lin_0.2 + clip_range: 0.4 + ent_coef: 0.008508727919228772 + gae_lambda: 0.9 + gamma: 0.9999 + learning_rate: 0.0012403278189645594 + max_grad_norm: 0.8 + n_epochs: 10 + n_steps: 512 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.489343896591493 MountainCar-v0: normalize: true @@ -66,16 +70,23 @@ MountainCar-v0: n_epochs: 4 ent_coef: 0.0 +# Tuned seals/MountainCar-v0: normalize: true n_envs: 16 n_timesteps: !!float 1e6 policy: 'MlpPolicy' - n_steps: 16 + batch_size: 512 + clip_range: 0.2 + ent_coef: 6.4940755116195606e-06 gae_lambda: 0.98 gamma: 0.99 - n_epochs: 4 - ent_coef: 0.0 + learning_rate: 0.0004476103728105138 + max_grad_norm: 1 + n_epochs: 20 + n_steps: 256 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.25988158989488963 # Tuned MountainCarContinuous-v0: @@ -379,8 +390,20 @@ Ant-v3: &mujoco-defaults n_timesteps: !!float 1e6 policy: 'MlpPolicy' +# Tuned seals/Ant-v0: <<: *mujoco-defaults + batch_size: 16 + clip_range: 0.3 + ent_coef: 3.1441389214159857e-06 + gae_lambda: 0.8 + gamma: 0.995 + learning_rate: 0.00017959211641976886 + max_grad_norm: 0.9 + n_epochs: 10 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.4351450387648799 # Hopper-v3: # <<: *mujoco-defaults @@ -396,9 +419,20 @@ Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +# Tuned seals/Swimmer-v0: <<: *mujoco-defaults - gamma: 0.9999 + batch_size: 8 + clip_range: 0.1 + ent_coef: 5.167107294612664e-08 + gae_lambda: 0.95 + gamma: 0.999 + learning_rate: 0.0001214437022727675 + max_grad_norm: 2 + n_epochs: 20 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.6162112311062333 # Tuned # 10 mujoco envs @@ -425,27 +459,23 @@ HalfCheetah-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +# Tuned seals/HalfCheetah-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e6 batch_size: 64 - n_steps: 512 - gamma: 0.98 - learning_rate: 2.0633e-05 - ent_coef: 0.000401762 clip_range: 0.1 - n_epochs: 20 - 
gae_lambda: 0.92 + ent_coef: 3.794797423594763e-06 + gae_lambda: 0.95 + gamma: 0.95 + learning_rate: 0.0003286871805949382 max_grad_norm: 0.8 - vf_coef: 0.58096 - policy_kwargs: "dict( - log_std_init=-2, - ortho_init=False, - activation_fn=nn.ReLU, - net_arch=[dict(pi=[256, 256], vf=[256, 256])] - )" + n_epochs: 5 + n_steps: 512 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.11483689492120866 # Ant-v3: # normalize: true @@ -485,27 +515,23 @@ Hopper-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +# Tuned seals/Hopper-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e6 - batch_size: 32 - n_steps: 512 - gamma: 0.999 - learning_rate: 9.80828e-05 - ent_coef: 0.00229519 - clip_range: 0.2 - n_epochs: 5 - gae_lambda: 0.99 - max_grad_norm: 0.7 - vf_coef: 0.835671 - policy_kwargs: "dict( - log_std_init=-2, - ortho_init=False, - activation_fn=nn.ReLU, - net_arch=[dict(pi=[256, 256], vf=[256, 256])] - )" + batch_size: 512 + clip_range: 0.1 + ent_coef: 0.0010159833764878474 + gae_lambda: 0.98 + gamma: 0.995 + learning_rate: 0.0003904770450788824 + max_grad_norm: 0.9 + n_epochs: 20 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.20315938606555833 HumanoidStandup-v3: normalize: true @@ -551,27 +577,24 @@ Humanoid-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +# Tuned seals/Humanoid-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e7 batch_size: 256 - n_steps: 512 - gamma: 0.95 - learning_rate: 3.56987e-05 - ent_coef: 0.00238306 - clip_range: 0.3 - n_epochs: 5 - gae_lambda: 0.9 - max_grad_norm: 2 - vf_coef: 0.431892 - policy_kwargs: "dict( - log_std_init=-2, - ortho_init=False, - activation_fn=nn.ReLU, - net_arch=[dict(pi=[256, 256], vf=[256, 256])] - )" + clip_range: 0.2 + ent_coef: 2.0745206045994986e-05 + gae_lambda: 0.92 + gamma: 0.999 + learning_rate: 2.0309225666232827e-05 + max_grad_norm: 0.5 + n_epochs: 20 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, + 256])]) + vf_coef: 0.819262464558427 InvertedDoublePendulum-v3: normalize: true @@ -659,18 +682,21 @@ Walker2d-v3: max_grad_norm: 1 vf_coef: 0.871923 +# Tuned seals/Walker2d-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e6 - batch_size: 32 - n_steps: 512 - gamma: 0.99 - learning_rate: 5.05041e-05 - ent_coef: 0.000585045 - clip_range: 0.1 - n_epochs: 20 - gae_lambda: 0.95 - max_grad_norm: 1 - vf_coef: 0.871923 + batch_size: 8 + clip_range: 0.4 + ent_coef: 0.00013057334805552262 + gae_lambda: 0.92 + gamma: 0.98 + learning_rate: 3.791707778339674e-05 + max_grad_norm: 0.6 + n_epochs: 5 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, + 256])]) + vf_coef: 0.6167177795726859 From ea16e61b7da8d0eee2b97049fff9a6254b765b10 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 29 Apr 2022 16:05:51 +0200 Subject: [PATCH 14/14] Add tuned hyperparameters for SAC and seals environments. 
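As a usage note, the new `seals/HalfCheetah-v0` entry below corresponds roughly to the following standalone stable-baselines3 call. This is only a sketch: the zoo's `ExperimentManager` builds the model from the YAML itself (including wrappers and vec-env handling), and the snippet assumes `gym`, `seals` and a working MuJoCo install.

    import gym
    import seals  # noqa: F401  (registers the seals/* environment ids)
    from stable_baselines3 import SAC

    # Hyperparameters taken from the seals/HalfCheetah-v0 entry added in this patch.
    model = SAC(
        "MlpPolicy",
        gym.make("seals/HalfCheetah-v0"),
        batch_size=2048,
        buffer_size=100_000,
        gamma=0.95,
        learning_rate=0.000884624878315995,
        learning_starts=10000,
        policy_kwargs=dict(net_arch=[64, 64], log_std_init=-0.6932709443503001),
        tau=0.01,
        train_freq=64,
    )
    model.learn(total_timesteps=int(1e6))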
--- hyperparams/sac.yml | 55 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index c0af74cd8..e9d2469f0 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -195,42 +195,95 @@ HalfCheetah-v3: &mujoco-defaults policy: 'MlpPolicy' learning_starts: 10000 +# Tuned seals/HalfCheetah-v0: <<: *mujoco-defaults + batch_size: 2048 + buffer_size: 100000 + gamma: 0.95 + learning_rate: 0.000884624878315995 + learning_starts: 10000 + policy_kwargs: dict(net_arch=[64, 64], log_std_init=-0.6932709443503001) + tau: 0.01 + train_freq: 64 Ant-v3: <<: *mujoco-defaults +# Tuned seals/Ant-v0: <<: *mujoco-defaults + batch_size: 512 + buffer_size: 1000000 + gamma: 0.98 + learning_rate: 0.0018514039303149058 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[256, 256], log_std_init=-2.2692589009754176) + tau: 0.05 + train_freq: 64 Hopper-v3: <<: *mujoco-defaults +# Tuned seals/Hopper-v0: <<: *mujoco-defaults + batch_size: 128 + buffer_size: 100000 + gamma: 0.98 + learning_rate: 0.001709807687567946 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[256, 256], log_std_init=-1.6829391077276037) + tau: 0.08 + train_freq: 32 Walker2d-v3: <<: *mujoco-defaults +# Tuned seals/Walker2d-v0: <<: *mujoco-defaults + batch_size: 128 + buffer_size: 100000 + gamma: 0.99 + learning_rate: 0.0005845844772048097 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[400, 300], log_std_init=0.1955317469998743) + tau: 0.02 + train_freq: 1 Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +# Tuned seals/Humanoid-v0: <<: *mujoco-defaults n_timesteps: !!float 2e6 + batch_size: 64 + buffer_size: 100000 + gamma: 0.98 + learning_rate: 4.426351861707874e-05 + learning_starts: 20000 + policy_kwargs: dict(net_arch=[400, 300], log_std_init=-0.1034412732183072) + tau: 0.08 + train_freq: 8 Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +# Tuned seals/Swimmer-v0: <<: *mujoco-defaults - gamma: 0.9999 + batch_size: 128 + buffer_size: 100000 + gamma: 0.995 + learning_rate: 0.00039981805535514633 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[400, 300], log_std_init=-2.689958330139309) + tau: 0.01 + train_freq: 256 # === HER Robotics GoalEnvs ===
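Putting the series together, the intended workflow is to point several train.py workers at the same Optuna storage and let `--total-n-trials` cap the combined number of completed or pruned trials. Below is a minimal launcher sketch mirroring tests/test_train.py; the storage path, study name and budget are illustrative only.

    import subprocess

    args = [
        "python", "train.py", "-optimize", "--no-optim-plots",
        "--algo", "ppo", "--env", "Pendulum-v1",
        "--storage", "sqlite:///optuna.db", "--study-name", "demo-study",
        "--total-n-trials", "6", "--n-evaluations", "1", "-n", "100",
    ]
    # Four workers share one study; each exits once the study holds 6 counted trials.
    workers = [subprocess.Popen(args) for _ in range(4)]
    assert all(p.wait() == 0 for p in workers)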