From bb00557eea8abaefee3dcdbd4ca2fd718030b6b6 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 11 Apr 2022 16:29:48 +0200 Subject: [PATCH 01/14] Use the MaxTrialsCallback to set the number of trials idnependent from workers --- utils/exp_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 1b6725831..5dffdb1f8 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -12,6 +12,7 @@ import optuna import torch as th import yaml +from optuna.study import MaxTrialsCallback from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler @@ -747,7 +748,7 @@ def hyperparameters_optimization(self) -> None: ) try: - study.optimize(self.objective, n_trials=self.n_trials, n_jobs=self.n_jobs) + study.optimize(self.objective, n_jobs=self.n_jobs, callbacks=[MaxTrialsCallback(self.n_trials)]) except KeyboardInterrupt: pass From edbd3ac8f9d9c6fc71ce5f7150a0b5d476910563 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 29 Mar 2022 16:16:16 +0200 Subject: [PATCH 02/14] Add seals environments and corresponding (tentative) hyperparameters. --- hyperparams/a2c.yml | 33 ++++++++++++ hyperparams/ars.yml | 77 ++++++++++++++++++++++++++++ hyperparams/ddpg.yml | 22 ++++++++ hyperparams/dqn.yml | 30 +++++++++++ hyperparams/ppo.yml | 113 ++++++++++++++++++++++++++++++++++++++++++ hyperparams/qrdqn.yml | 30 +++++++++++ hyperparams/sac.yml | 20 ++++++++ hyperparams/td3.yml | 32 ++++++++++++ hyperparams/tqc.yml | 21 ++++++++ hyperparams/trpo.yml | 44 ++++++++++++++++ requirements.txt | 1 + utils/import_envs.py | 5 ++ 12 files changed, 428 insertions(+) diff --git a/hyperparams/a2c.yml b/hyperparams/a2c.yml index f51b0fb09..23763cd02 100644 --- a/hyperparams/a2c.yml +++ b/hyperparams/a2c.yml @@ -15,6 +15,12 @@ CartPole-v1: policy: 'MlpPolicy' ent_coef: 0.0 +seals/CartPole-v0: + n_envs: 8 + n_timesteps: !!float 5e5 + policy: 'MlpPolicy' + ent_coef: 0.0 + LunarLander-v2: n_envs: 8 n_timesteps: !!float 2e5 @@ -31,6 +37,13 @@ MountainCar-v0: policy: 'MlpPolicy' ent_coef: .0 +seals/MountainCar-v0: + normalize: true + n_envs: 16 + n_timesteps: !!float 1e6 + policy: 'MlpPolicy' + ent_coef: .0 + Acrobot-v1: normalize: true n_envs: 16 @@ -166,19 +179,39 @@ HalfCheetah-v3: &mujoco-defaults n_timesteps: !!float 1e6 policy: 'MlpPolicy' +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults +seals/Hopper-v0: + <<: *mujoco-defaults + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 + +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 diff --git a/hyperparams/ars.yml b/hyperparams/ars.yml index e58d4fa3c..eb89cc211 100644 --- a/hyperparams/ars.yml +++ b/hyperparams/ars.yml @@ -5,6 +5,12 @@ CartPole-v1: policy: 'LinearPolicy' n_delta: 2 +seals/CartPole-v0: + n_envs: 1 + n_timesteps: !!float 5e4 + policy: 'LinearPolicy' + n_delta: 2 + # Tuned Pendulum-v1: &pendulum-params n_envs: 1 @@ -41,6 +47,11 @@ MountainCar-v0: n_delta: 8 n_timesteps: !!float 5e5 +seals/MountainCar-v0: + <<: *pendulum-params + n_delta: 8 + n_timesteps: !!float 5e5 + # Tuned MountainCarContinuous-v0: <<: *pendulum-params @@ -119,6 
+130,17 @@ Swimmer-v3: alive_bonus_offset: 0 # normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Swimmer-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 2e6 + learning_rate: !!float 0.02 + delta_std: !!float 0.01 + n_delta: 1 + n_top: 1 + alive_bonus_offset: 0 + # normalize: "dict(norm_obs=True, norm_reward=False)" + Hopper-v3: n_envs: 1 policy: 'LinearPolicy' @@ -130,6 +152,17 @@ Hopper-v3: alive_bonus_offset: -1 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Hopper-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 7e6 + learning_rate: !!float 0.01 + delta_std: !!float 0.025 + n_delta: 8 + n_top: 4 + alive_bonus_offset: -1 + normalize: "dict(norm_obs=True, norm_reward=False)" + HalfCheetah-v3: n_envs: 1 policy: 'LinearPolicy' @@ -141,6 +174,17 @@ HalfCheetah-v3: alive_bonus_offset: 0 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/HalfCheetah-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 1.25e7 + learning_rate: !!float 0.02 + delta_std: !!float 0.03 + n_delta: 32 + n_top: 4 + alive_bonus_offset: 0 + normalize: "dict(norm_obs=True, norm_reward=False)" + Walker2d-v3: n_envs: 1 policy: 'LinearPolicy' @@ -152,6 +196,17 @@ Walker2d-v3: alive_bonus_offset: -1 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Walker2d-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 7.5e7 + learning_rate: !!float 0.03 + delta_std: !!float 0.025 + n_delta: 40 + n_top: 30 + alive_bonus_offset: -1 + normalize: "dict(norm_obs=True, norm_reward=False)" + Ant-v3: n_envs: 1 policy: 'LinearPolicy' @@ -163,6 +218,17 @@ Ant-v3: alive_bonus_offset: -1 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Ant-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 7.5e7 + learning_rate: !!float 0.015 + delta_std: !!float 0.025 + n_delta: 60 + n_top: 20 + alive_bonus_offset: -1 + normalize: "dict(norm_obs=True, norm_reward=False)" + Humanoid-v3: n_envs: 1 @@ -175,6 +241,17 @@ Humanoid-v3: alive_bonus_offset: -5 normalize: "dict(norm_obs=True, norm_reward=False)" +seals/Humanoid-v0: + n_envs: 1 + policy: 'LinearPolicy' + n_timesteps: !!float 2.5e8 + learning_rate: 0.02 + delta_std: 0.0075 + n_delta: 256 + n_top: 256 + alive_bonus_offset: -5 + normalize: "dict(norm_obs=True, norm_reward=False)" + # Almost tuned BipedalWalker-v3: n_envs: 1 diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml index 14a53cfca..996cee226 100644 --- a/hyperparams/ddpg.yml +++ b/hyperparams/ddpg.yml @@ -131,21 +131,43 @@ HalfCheetah-v3: &mujoco-defaults noise_type: 'normal' noise_std: 0.1 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults +seals/Hopper-v0: + <<: *mujoco-defaults + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 train_freq: 1 gradient_steps: 1 + +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + train_freq: 1 + gradient_steps: 1 diff --git a/hyperparams/dqn.yml b/hyperparams/dqn.yml index 301adf654..6fb77ed6c 100644 --- a/hyperparams/dqn.yml +++ b/hyperparams/dqn.yml @@ -31,6 +31,21 @@ CartPole-v1: exploration_final_eps: 0.04 policy_kwargs: "dict(net_arch=[256, 256])" +seals/CartPole-v0: + n_timesteps: !!float 5e4 + policy: 'MlpPolicy' + learning_rate: !!float 2.3e-3 + batch_size: 64 + buffer_size: 100000 
+ learning_starts: 1000 + gamma: 0.99 + target_update_interval: 10 + train_freq: 256 + gradient_steps: 128 + exploration_fraction: 0.16 + exploration_final_eps: 0.04 + policy_kwargs: "dict(net_arch=[256, 256])" + # Tuned MountainCar-v0: n_timesteps: !!float 1.2e5 @@ -47,6 +62,21 @@ MountainCar-v0: exploration_final_eps: 0.07 policy_kwargs: "dict(net_arch=[256, 256])" +seals/MountainCar-v0: + n_timesteps: !!float 1.2e5 + policy: 'MlpPolicy' + learning_rate: !!float 4e-3 + batch_size: 128 + buffer_size: 10000 + learning_starts: 1000 + gamma: 0.98 + target_update_interval: 600 + train_freq: 16 + gradient_steps: 8 + exploration_fraction: 0.2 + exploration_final_eps: 0.07 + policy_kwargs: "dict(net_arch=[256, 256])" + # Tuned LunarLander-v2: n_timesteps: !!float 1e5 diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index 2a88c30d8..07e4e51f7 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -42,6 +42,19 @@ CartPole-v1: learning_rate: lin_0.001 clip_range: lin_0.2 +seals/CartPole-v0: + n_envs: 8 + n_timesteps: !!float 1e5 + policy: 'MlpPolicy' + n_steps: 32 + batch_size: 256 + gae_lambda: 0.8 + gamma: 0.98 + n_epochs: 20 + ent_coef: 0.0 + learning_rate: lin_0.001 + clip_range: lin_0.2 + MountainCar-v0: normalize: true n_envs: 16 @@ -53,6 +66,17 @@ MountainCar-v0: n_epochs: 4 ent_coef: 0.0 +seals/MountainCar-v0: + normalize: true + n_envs: 16 + n_timesteps: !!float 1e6 + policy: 'MlpPolicy' + n_steps: 16 + gae_lambda: 0.98 + gamma: 0.99 + n_epochs: 4 + ent_coef: 0.0 + # Tuned MountainCarContinuous-v0: normalize: true @@ -355,6 +379,9 @@ Ant-v3: &mujoco-defaults n_timesteps: !!float 1e6 policy: 'MlpPolicy' +seals/Ant-v0: + <<: *mujoco-defaults + # Hopper-v3: # <<: *mujoco-defaults # @@ -369,6 +396,10 @@ Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # Tuned # 10 mujoco envs @@ -394,6 +425,28 @@ HalfCheetah-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +seals/HalfCheetah-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e6 + batch_size: 64 + n_steps: 512 + gamma: 0.98 + learning_rate: 2.0633e-05 + ent_coef: 0.000401762 + clip_range: 0.1 + n_epochs: 20 + gae_lambda: 0.92 + max_grad_norm: 0.8 + vf_coef: 0.58096 + policy_kwargs: "dict( + log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" + # Ant-v3: # normalize: true # n_envs: 1 @@ -432,6 +485,28 @@ Hopper-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +seals/Hopper-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e6 + batch_size: 32 + n_steps: 512 + gamma: 0.999 + learning_rate: 9.80828e-05 + ent_coef: 0.00229519 + clip_range: 0.2 + n_epochs: 5 + gae_lambda: 0.99 + max_grad_norm: 0.7 + vf_coef: 0.835671 + policy_kwargs: "dict( + log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" + HumanoidStandup-v3: normalize: true n_envs: 1 @@ -476,6 +551,28 @@ Humanoid-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +seals/Humanoid-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e7 + batch_size: 256 + n_steps: 512 + gamma: 0.95 + learning_rate: 3.56987e-05 + ent_coef: 0.00238306 + clip_range: 0.3 + n_epochs: 5 + gae_lambda: 0.9 + max_grad_norm: 2 + vf_coef: 0.431892 + policy_kwargs: "dict( + log_std_init=-2, + ortho_init=False, + activation_fn=nn.ReLU, + net_arch=[dict(pi=[256, 256], vf=[256, 256])] + )" + InvertedDoublePendulum-v3: normalize: true 
n_envs: 1 @@ -561,3 +658,19 @@ Walker2d-v3: gae_lambda: 0.95 max_grad_norm: 1 vf_coef: 0.871923 + +seals/Walker2d-v0: + normalize: true + n_envs: 1 + policy: 'MlpPolicy' + n_timesteps: !!float 1e6 + batch_size: 32 + n_steps: 512 + gamma: 0.99 + learning_rate: 5.05041e-05 + ent_coef: 0.000585045 + clip_range: 0.1 + n_epochs: 20 + gae_lambda: 0.95 + max_grad_norm: 1 + vf_coef: 0.871923 diff --git a/hyperparams/qrdqn.yml b/hyperparams/qrdqn.yml index 6a6f13bd0..aca12afba 100644 --- a/hyperparams/qrdqn.yml +++ b/hyperparams/qrdqn.yml @@ -23,6 +23,21 @@ CartPole-v1: exploration_final_eps: 0.04 policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=10)" +seals/CartPole-v0: + n_timesteps: !!float 5e4 + policy: 'MlpPolicy' + learning_rate: !!float 2.3e-3 + batch_size: 64 + buffer_size: 100000 + learning_starts: 1000 + gamma: 0.99 + target_update_interval: 10 + train_freq: 256 + gradient_steps: 128 + exploration_fraction: 0.16 + exploration_final_eps: 0.04 + policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=10)" + # Tuned MountainCar-v0: n_timesteps: !!float 1.2e5 @@ -39,6 +54,21 @@ MountainCar-v0: exploration_final_eps: 0.07 policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=25)" +seals/MountainCar-v0: + n_timesteps: !!float 1.2e5 + policy: 'MlpPolicy' + learning_rate: !!float 4e-3 + batch_size: 128 + buffer_size: 10000 + learning_starts: 1000 + gamma: 0.98 + target_update_interval: 600 + train_freq: 16 + gradient_steps: 8 + exploration_fraction: 0.2 + exploration_final_eps: 0.07 + policy_kwargs: "dict(net_arch=[256, 256], n_quantiles=25)" + # Tuned LunarLander-v2: n_timesteps: !!float 1e5 diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index 8aeaef616..c0af74cd8 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -195,23 +195,43 @@ HalfCheetah-v3: &mujoco-defaults policy: 'MlpPolicy' learning_starts: 10000 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults +seals/Hopper-v0: + <<: *mujoco-defaults + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # === HER Robotics GoalEnvs === FetchReach-v1: diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml index 9b941516b..c29d49199 100644 --- a/hyperparams/td3.yml +++ b/hyperparams/td3.yml @@ -133,9 +133,15 @@ HalfCheetah-v3: &mujoco-defaults noise_type: 'normal' noise_std: 0.1 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults # SAC Hyperparams @@ -144,9 +150,20 @@ Hopper-v3: learning_rate: !!float 3e-4 batch_size: 256 +seals/Hopper-v0: + <<: *mujoco-defaults + # SAC Hyperparams + train_freq: 1 + gradient_steps: 1 + learning_rate: !!float 3e-4 + batch_size: 256 + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 @@ -156,9 +173,24 @@ Humanoid-v3: learning_rate: !!float 3e-4 batch_size: 256 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + # SAC Hyperparams + train_freq: 1 + gradient_steps: 1 + learning_rate: !!float 3e-4 + batch_size: 256 + # Tuned Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 train_freq: 1 gradient_steps: 1 + +seals/Swimmer-v0: + <<: *mujoco-defaults 
+ gamma: 0.9999 + train_freq: 1 + gradient_steps: 1 diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index 54e2f3a4d..23587c2e6 100644 --- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -145,24 +145,45 @@ HalfCheetah-v3: &mujoco-defaults policy: 'MlpPolicy' learning_starts: 10000 +seals/HalfCheetah-v0: + <<: *mujoco-defaults + Ant-v3: <<: *mujoco-defaults +seals/Ant-v0: + <<: *mujoco-defaults + Hopper-v3: <<: *mujoco-defaults top_quantiles_to_drop_per_net: 5 +seals/Hopper-v0: + <<: *mujoco-defaults + top_quantiles_to_drop_per_net: 5 + Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # === HER Robotics GoalEnvs === FetchReach-v1: env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper diff --git a/hyperparams/trpo.yml b/hyperparams/trpo.yml index c78263338..01fbfccba 100644 --- a/hyperparams/trpo.yml +++ b/hyperparams/trpo.yml @@ -11,6 +11,18 @@ CartPole-v1: learning_rate: !!float 1e-3 n_critic_updates: 20 +seals/CartPole-v0: + n_envs: 2 + n_timesteps: !!float 1e5 + policy: 'MlpPolicy' + n_steps: 512 + batch_size: 512 + cg_damping: !!float 1e-3 + gae_lambda: 0.98 + gamma: 0.99 + learning_rate: !!float 1e-3 + n_critic_updates: 20 + # Tuned Pendulum-v1: n_envs: 2 @@ -60,6 +72,14 @@ MountainCar-v0: n_steps: 1024 n_critic_updates: 20 +seals/MountainCar-v0: + normalize: true + n_envs: 2 + n_timesteps: !!float 1e5 + policy: 'MlpPolicy' + n_steps: 1024 + n_critic_updates: 20 + # Tuned MountainCarContinuous-v0: normalize: True @@ -122,25 +142,49 @@ Ant-v3: &mujoco-defaults <<: *pybullet-defaults n_timesteps: !!float 1e6 +seals/Ant-v0: + <<: *mujoco-defaults + # Tuned HalfCheetah-v3: <<: *mujoco-defaults target_kl: 0.04 + +seals/HalfCheetah-v0: + <<: *mujoco-defaults + target_kl: 0.04 + # Tuned Hopper-v3: <<: *mujoco-defaults + +seals/Hopper-v0: + <<: *mujoco-defaults + # Tuned Walker2d-v3: <<: *mujoco-defaults +seals/Walker2d-v0: + <<: *mujoco-defaults + Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 + +seals/Humanoid-v0: + <<: *mujoco-defaults + n_timesteps: !!float 2e6 + # Tuned Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +seals/Swimmer-v0: + <<: *mujoco-defaults + gamma: 0.9999 + # Tuned BipedalWalker-v3: <<: *mujoco-defaults diff --git a/requirements.txt b/requirements.txt index dbd890eb1..e28ddd32e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ plotly panda-gym==1.1.1 # tmp fix: until compatibility with panda-gym v2 rliable>=1.0.5 wandb +seals diff --git a/utils/import_envs.py b/utils/import_envs.py index fbe0370e3..8b74024ea 100644 --- a/utils/import_envs.py +++ b/utils/import_envs.py @@ -32,3 +32,8 @@ import panda_gym # pytype: disable=import-error except ImportError: panda_gym = None + +try: + import seals +except ImportError: + seals = None From fc8244468371faaa40f090aa136732b7615c6e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Sun, 17 Apr 2022 20:37:20 +0200 Subject: [PATCH 03/14] Use `NopPruner` when pruner is set to `"none"` (#234) * Replace diverted MedianPruner by NopPruner * Update changelog Co-authored-by: Antonin RAFFIN --- CHANGELOG.md | 1 + utils/exp_manager.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25c7dd5ab..3108d0185 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Documentation ### Other +- When pruner is set to `"none"`, use `NopPruner` instead of diverted `MedianPruner` (@qgallouedec) ## Release 1.5.0 (2022-03-25) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 5dffdb1f8..66a34c2df 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -14,7 +14,7 @@ import yaml from optuna.study import MaxTrialsCallback from optuna.integration.skopt import SkoptSampler -from optuna.pruners import BasePruner, MedianPruner, SuccessiveHalvingPruner +from optuna.pruners import BasePruner, MedianPruner, NopPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler from optuna.visualization import plot_optimization_history, plot_param_importances from sb3_contrib.common.vec_env import AsyncEval @@ -620,7 +620,7 @@ def _create_pruner(self, pruner_method: str) -> BasePruner: pruner = MedianPruner(n_startup_trials=self.n_startup_trials, n_warmup_steps=self.n_evaluations // 3) elif pruner_method == "none": # Do not prune - pruner = MedianPruner(n_startup_trials=self.n_trials, n_warmup_steps=self.n_evaluations) + pruner = NopPruner() else: raise ValueError(f"Unknown pruner: {pruner_method}") return pruner From 217cf1fee6e370252f107684c7f3c9e714363a21 Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Thu, 21 Apr 2022 17:34:52 +0200 Subject: [PATCH 04/14] Dox fixes and move to python 3.7+ style (#237) * Dox fixes and move to python 3.7+ style * Fix type * Fix broken packages --- .github/workflows/ci.yml | 4 +-- .github/workflows/trained_agents.yml | 4 +-- CHANGELOG.md | 5 ++- README.md | 2 +- docker/Dockerfile | 2 +- enjoy.py | 2 +- hyperparams/ppo.yml | 2 +- requirements.txt | 1 + scripts/plot_from_file.py | 2 +- utils/callbacks.py | 10 +++--- utils/exp_manager.py | 6 ++-- utils/record_video.py | 2 +- utils/utils.py | 4 +-- utils/wrappers.py | 48 ++++++++++++++-------------- 14 files changed, 49 insertions(+), 45 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 24390204e..45d873255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,13 +30,13 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html pip install pybullet==3.1.9 pip install -r requirements.txt # Use headless version pip install opencv-python-headless # install parking-env to test HER (pinned so it works with gym 0.21) - pip install git+https://github.com/eleurent/highway-env@1a04c6a98be64632cf9683625022023e70ff1ab1 + pip install highway-env==1.5.0 - name: Type check run: | make type diff --git a/.github/workflows/trained_agents.yml b/.github/workflows/trained_agents.yml index 39a970f85..a047e9861 100644 --- a/.github/workflows/trained_agents.yml +++ b/.github/workflows/trained_agents.yml @@ -29,13 +29,13 @@ jobs: run: | python -m pip install --upgrade pip # cpu version of pytorch - faster to download - pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip install torch==1.11+cpu -f https://download.pytorch.org/whl/torch_stable.html pip install pybullet==3.1.9 pip install -r requirements.txt # Use headless version pip install opencv-python-headless # install parking-env to test HER (pinned so it works with gym 0.21) - pip install 
git+https://github.com/eleurent/highway-env@1a04c6a98be64632cf9683625022023e70ff1ab1 + pip install highway-env==1.5.0 # Add support for pickle5 protocol pip install pickle5 - name: Check trained agents diff --git a/CHANGELOG.md b/CHANGELOG.md index 3108d0185..bc7c863ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,15 @@ ### Breaking Changes - Change default value for number of hyperparameter optimization trials from 10 to 500. (@ernestum) - Derive number of intermediate pruning evaluations from number of time steps (1 evaluation per 100k time steps.) (@ernestum) -- Updated default --eval-freq from 10k to 25k steps +- Updated default --eval-freq from 10k to 25k steps +- Update default horizon to 2 for the `HistoryWrapper` ### New Features - Support setting PyTorch's device with thye `--device` flag (@gregwar) ### Bug fixes +- Fix `Reacher-v3` name in PPO hyperparameter file +- Pinned ale-py==0.7.4 until new SB3 version is released ### Documentation diff --git a/README.md b/README.md index 9b760fa3a..181aaec0f 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ python scripts/plot_train.py -a her -e Fetch -y success -f rl-trained-agents/ -w Plot evaluation reward curve for TQC, SAC and TD3 on the HalfCheetah and Ant PyBullet environments: ``` -python scripts/all_plots.py -a sac td3 tqc --env HalfCheetah Ant -f rl-trained-agents/ +python3 scripts/all_plots.py -a sac td3 tqc --env HalfCheetahBullet AntBullet -f rl-trained-agents/ ``` ## Plot with the rliable library diff --git a/docker/Dockerfile b/docker/Dockerfile index a99e9a7b1..2baa348c1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,7 +21,7 @@ RUN \ mkdir -p ${CODE_DIR}/rl_zoo && \ pip uninstall -y stable-baselines3 && \ pip install -r /tmp/requirements.txt && \ - pip install git+https://github.com/eleurent/highway-env@1a04c6a98be64632cf9683625022023e70ff1ab1 && \ + pip install pip install highway-env==1.5.0 && \ rm -rf $HOME/.cache/pip ENV PATH=$VENV/bin:$PATH diff --git a/enjoy.py b/enjoy.py index c92304fb2..731b451bd 100644 --- a/enjoy.py +++ b/enjoy.py @@ -138,7 +138,7 @@ def step_count(checkpoint_path: str) -> int: env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): - with open(args_path, "r") as f: + with open(args_path) as f: loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index 07e4e51f7..fc2d3e1aa 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -605,7 +605,7 @@ InvertedPendulum-v3: max_grad_norm: 0.3 vf_coef: 0.19816 -Reacher-v3: +Reacher-v2: normalize: true n_envs: 1 policy: 'MlpPolicy' diff --git a/requirements.txt b/requirements.txt index e28ddd32e..6fe1ba43e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,5 @@ plotly panda-gym==1.1.1 # tmp fix: until compatibility with panda-gym v2 rliable>=1.0.5 wandb +ale-py==0.7.4 # tmp fix: until new SB3 version is released seals diff --git a/scripts/plot_from_file.py b/scripts/plot_from_file.py index 4ec0043cf..b77660c86 100644 --- a/scripts/plot_from_file.py +++ b/scripts/plot_from_file.py @@ -193,7 +193,7 @@ def restyle_boxplot(artist_dict, color, gray="#222222", linewidth=1, fliersize=5 warnings.warn(f"{env} not found for normalizing scores, you should update `env_key_to_env_id`") # Truncate to convert to matrix - min_runs = min([len(algo_score) for algo_score in algo_scores]) + min_runs = 
min(len(algo_score) for algo_score in algo_scores) if min_runs > 0: algo_scores = [algo_score[:min_runs] for algo_score in algo_scores] # shape: (n_envs, n_runs) -> (n_runs, n_envs) diff --git a/utils/callbacks.py b/utils/callbacks.py index 608c5a778..5d3931d9f 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -31,7 +31,7 @@ def __init__( log_path: Optional[str] = None, ): - super(TrialEvalCallback, self).__init__( + super().__init__( eval_env=eval_env, n_eval_episodes=n_eval_episodes, eval_freq=eval_freq, @@ -46,7 +46,7 @@ def __init__( def _on_step(self) -> bool: if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0: - super(TrialEvalCallback, self)._on_step() + super()._on_step() self.eval_idx += 1 # report best or report current ? # report num_timesteps or elasped time ? @@ -69,7 +69,7 @@ class SaveVecNormalizeCallback(BaseCallback): """ def __init__(self, save_freq: int, save_path: str, name_prefix: Optional[str] = None, verbose: int = 0): - super(SaveVecNormalizeCallback, self).__init__(verbose) + super().__init__(verbose) self.save_freq = save_freq self.save_path = save_path self.name_prefix = name_prefix @@ -111,7 +111,7 @@ class ParallelTrainCallback(BaseCallback): """ def __init__(self, gradient_steps: int = 100, verbose: int = 0, sleep_time: float = 0.0): - super(ParallelTrainCallback, self).__init__(verbose) + super().__init__(verbose) self.batch_size = 0 self._model_ready = True self._model = None @@ -202,7 +202,7 @@ class RawStatisticsCallback(BaseCallback): """ def __init__(self, verbose=0): - super(RawStatisticsCallback, self).__init__(verbose) + super().__init__(verbose) # Custom counter to reports stats # (and avoid reporting multiple values for the same step) self._timesteps_counter = 0 diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 66a34c2df..6fe7bd870 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -48,7 +48,7 @@ from utils.utils import ALGOS, get_callback_list, get_latest_run_id, get_wrapper_class, linear_schedule -class ExperimentManager(object): +class ExperimentManager: """ Experiment manager: read the hyperparameters, preprocess them, create the environment and the RL model. 
@@ -91,7 +91,7 @@ def __init__( no_optim_plots: bool = False, device: Union[th.device, str] = "auto", ): - super(ExperimentManager, self).__init__() + super().__init__() self.algo = algo self.env_id = env_id # Custom params @@ -261,7 +261,7 @@ def _save_config(self, saved_hyperparams: Dict[str, Any]) -> None: def read_hyperparameters(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: # Load hyperparameters from yaml file - with open(f"hyperparams/{self.algo}.yml", "r") as f: + with open(f"hyperparams/{self.algo}.yml") as f: hyperparams_dict = yaml.safe_load(f) if self.env_id in list(hyperparams_dict.keys()): hyperparams = hyperparams_dict[self.env_id] diff --git a/utils/record_video.py b/utils/record_video.py index dd89c0220..cc9d9ab68 100644 --- a/utils/record_video.py +++ b/utils/record_video.py @@ -86,7 +86,7 @@ env_kwargs = {} args_path = os.path.join(log_path, env_id, "args.yml") if os.path.isfile(args_path): - with open(args_path, "r") as f: + with open(args_path) as f: loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr if loaded_args["env_kwargs"] is not None: env_kwargs = loaded_args["env_kwargs"] diff --git a/utils/utils.py b/utils/utils.py index 6072cc7cd..e46213dbf 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -318,7 +318,7 @@ def get_saved_hyperparams( config_file = os.path.join(stats_path, "config.yml") if os.path.isfile(config_file): # Load saved hyperparameters - with open(os.path.join(stats_path, "config.yml"), "r") as f: + with open(os.path.join(stats_path, "config.yml")) as f: hyperparams = yaml.load(f, Loader=yaml.UnsafeLoader) # pytype: disable=module-attr hyperparams["normalize"] = hyperparams.get("normalize", False) else: @@ -347,7 +347,7 @@ class StoreDict(argparse.Action): def __init__(self, option_strings, dest, nargs=None, **kwargs): self._nargs = nargs - super(StoreDict, self).__init__(option_strings, dest, nargs=nargs, **kwargs) + super().__init__(option_strings, dest, nargs=nargs, **kwargs) def __call__(self, parser, namespace, values, option_string=None): arg_dict = {} diff --git a/utils/wrappers.py b/utils/wrappers.py index 9cdaf783f..a62155338 100644 --- a/utils/wrappers.py +++ b/utils/wrappers.py @@ -11,7 +11,7 @@ class DoneOnSuccessWrapper(gym.Wrapper): """ def __init__(self, env: gym.Env, reward_offset: float = 0.0, n_successes: int = 1): - super(DoneOnSuccessWrapper, self).__init__(env) + super().__init__(env) self.reward_offset = reward_offset self.n_successes = n_successes self.current_successes = 0 @@ -41,12 +41,12 @@ class ActionNoiseWrapper(gym.Wrapper): Add gaussian noise to the action (without telling the agent), to test the robustness of the control. - :param env: (gym.Env) - :param noise_std: (float) Standard deviation of the noise + :param env: + :param noise_std: Standard deviation of the noise """ - def __init__(self, env, noise_std=0.1): - super(ActionNoiseWrapper, self).__init__(env) + def __init__(self, env: gym.Env, noise_std: float = 0.1): + super().__init__(env) self.noise_std = noise_std def step(self, action): @@ -95,13 +95,13 @@ class LowPassFilterWrapper(gym.Wrapper): """ Butterworth-Lowpass - :param env: (gym.Env) + :param env: :param freq: Filter corner frequency. :param df: Sampling rate in Hz. 
""" - def __init__(self, env, freq=5.0, df=25.0): - super(LowPassFilterWrapper, self).__init__(env) + def __init__(self, env: gym.Env, freq: float = 5.0, df: float = 25.0): + super().__init__(env) self.freq = freq self.df = df self.signal = [] @@ -123,12 +123,12 @@ class ActionSmoothingWrapper(gym.Wrapper): """ Smooth the action using exponential moving average. - :param env: (gym.Env) - :param smoothing_coef: (float) Smoothing coefficient (0 no smoothing, 1 very smooth) + :param env: + :param smoothing_coef: Smoothing coefficient (0 no smoothing, 1 very smooth) """ - def __init__(self, env, smoothing_coef: float = 0.0): - super(ActionSmoothingWrapper, self).__init__(env) + def __init__(self, env: gym.Env, smoothing_coef: float = 0.0): + super().__init__(env) self.smoothing_coef = smoothing_coef self.smoothed_action = None # from https://github.com/rail-berkeley/softlearning/issues/3 @@ -152,12 +152,12 @@ class DelayedRewardWrapper(gym.Wrapper): Delay the reward by `delay` steps, it makes the task harder but more realistic. The reward is accumulated during those steps. - :param env: (gym.Env) - :param delay: (int) Number of steps the reward should be delayed. + :param env: + :param delay: Number of steps the reward should be delayed. """ - def __init__(self, env, delay=10): - super(DelayedRewardWrapper, self).__init__(env) + def __init__(self, env: gym.Env, delay: int = 10): + super().__init__(env) self.delay = delay self.current_step = 0 self.accumulated_reward = 0.0 @@ -185,11 +185,11 @@ class HistoryWrapper(gym.Wrapper): """ Stack past observations and actions to give an history to the agent. - :param env: (gym.Env) - :param horizon: (int) Number of steps to keep in the history. + :param env: + :param horizon:Number of steps to keep in the history. """ - def __init__(self, env: gym.Env, horizon: int = 5): + def __init__(self, env: gym.Env, horizon: int = 2): assert isinstance(env.observation_space, gym.spaces.Box) wrapped_obs_space = env.observation_space @@ -208,7 +208,7 @@ def __init__(self, env: gym.Env, horizon: int = 5): # Overwrite the observation space env.observation_space = gym.spaces.Box(low=low, high=high, dtype=wrapped_obs_space.dtype) - super(HistoryWrapper, self).__init__(env) + super().__init__(env) self.horizon = horizon self.low_action, self.high_action = low_action, high_action @@ -244,11 +244,11 @@ class HistoryWrapperObsDict(gym.Wrapper): """ History Wrapper for dict observation. - :param env: (gym.Env) - :param horizon: (int) Number of steps to keep in the history. + :param env: + :param horizon: Number of steps to keep in the history. 
""" - def __init__(self, env, horizon=5): + def __init__(self, env: gym.Env, horizon: int = 2): assert isinstance(env.observation_space.spaces["observation"], gym.spaces.Box) wrapped_obs_space = env.observation_space.spaces["observation"] @@ -267,7 +267,7 @@ def __init__(self, env, horizon=5): # Overwrite the observation space env.observation_space.spaces["observation"] = gym.spaces.Box(low=low, high=high, dtype=wrapped_obs_space.dtype) - super(HistoryWrapperObsDict, self).__init__(env) + super().__init__(env) self.horizon = horizon self.low_action, self.high_action = low_action, high_action From bcb42996ceb39a1001260fda50340d5237afffa6 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 21 Apr 2022 17:42:07 +0200 Subject: [PATCH 05/14] Fix division by zero with n-evaluations (closes #238) --- utils/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 6fe7bd870..1d756bd24 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -345,7 +345,7 @@ def _preprocess_hyperparams( # Derive n_evaluations from number of timesteps if needed if self.n_evaluations is None and self.optimize_hyperparameters: - self.n_evaluations = self.n_timesteps // int(1e5) + self.n_evaluations = max(1, self.n_timesteps // int(1e5)) print( f"Doing {self.n_evaluations} intermediate evaluations for pruning based on the number of timesteps." " (1 evaluation every 100k timesteps)" From 763acebde7e5227db0a3a505e6eb7f616a6d8a67 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Apr 2022 12:36:28 +0200 Subject: [PATCH 06/14] Add command line option for total number of trials. --- train.py | 8 ++++++++ utils/exp_manager.py | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index bbf85795e..3bf57b3d7 100644 --- a/train.py +++ b/train.py @@ -64,6 +64,13 @@ type=int, default=500, ) + parser.add_argument( + "--total-n-trials", + help="Number of trials for optimizing hyperparameters. 
" + "This applies to the entire optimization process and takes precedence over --n-trials if set.", + type=int, + default=None, + ) parser.add_argument( "-optimize", "--optimize-hyperparameters", action="store_true", default=False, help="Run hyperparameters search" ) @@ -201,6 +208,7 @@ args.storage, args.study_name, args.n_trials, + args.total_n_trials, args.n_jobs, args.sampler, args.pruner, diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 1d756bd24..b00b22daa 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -16,6 +16,7 @@ from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, NopPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler +from optuna.trial import TrialState from optuna.visualization import plot_optimization_history, plot_param_importances from sb3_contrib.common.vec_env import AsyncEval @@ -74,6 +75,7 @@ def __init__( storage: Optional[str] = None, study_name: Optional[str] = None, n_trials: int = 1, + total_n_trials: Optional[int] = None, n_jobs: int = 1, sampler: str = "tpe", pruner: str = "median", @@ -134,6 +136,7 @@ def __init__( self.no_optim_plots = no_optim_plots # maximum number of trials for finding the best hyperparams self.n_trials = n_trials + self.total_n_trials = total_n_trials # number of parallel jobs when doing hyperparameter search self.n_jobs = n_jobs self.sampler = sampler @@ -748,7 +751,17 @@ def hyperparameters_optimization(self) -> None: ) try: - study.optimize(self.objective, n_jobs=self.n_jobs, callbacks=[MaxTrialsCallback(self.n_trials)]) + if self.total_n_trials is not None: + study.optimize(self.objective, + n_jobs=self.n_jobs, + callbacks= + [MaxTrialsCallback( + self.total_n_trials, + states=[TrialState.COMPLETE, TrialState.RUNNING])]) + else: + study.optimize(self.objective, + n_jobs=self.n_jobs, + n_trials=self.n_trials) except KeyboardInterrupt: pass From 975c520993181476600cc8f2a706278e1de0d97e Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Apr 2022 12:46:04 +0200 Subject: [PATCH 07/14] Fix formatting. --- train.py | 2 +- utils/exp_manager.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/train.py b/train.py index 3bf57b3d7..d55eac042 100644 --- a/train.py +++ b/train.py @@ -67,7 +67,7 @@ parser.add_argument( "--total-n-trials", help="Number of trials for optimizing hyperparameters. 
" - "This applies to the entire optimization process and takes precedence over --n-trials if set.", + "This applies to the entire optimization process and takes precedence over --n-trials if set.", type=int, default=None, ) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index b00b22daa..4dbeba261 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -12,10 +12,10 @@ import optuna import torch as th import yaml -from optuna.study import MaxTrialsCallback from optuna.integration.skopt import SkoptSampler from optuna.pruners import BasePruner, MedianPruner, NopPruner, SuccessiveHalvingPruner from optuna.samplers import BaseSampler, RandomSampler, TPESampler +from optuna.study import MaxTrialsCallback from optuna.trial import TrialState from optuna.visualization import plot_optimization_history, plot_param_importances from sb3_contrib.common.vec_env import AsyncEval @@ -752,16 +752,13 @@ def hyperparameters_optimization(self) -> None: try: if self.total_n_trials is not None: - study.optimize(self.objective, - n_jobs=self.n_jobs, - callbacks= - [MaxTrialsCallback( - self.total_n_trials, - states=[TrialState.COMPLETE, TrialState.RUNNING])]) + study.optimize( + self.objective, + n_jobs=self.n_jobs, + callbacks=[MaxTrialsCallback(self.total_n_trials, states=[TrialState.COMPLETE, TrialState.RUNNING])], + ) else: - study.optimize(self.objective, - n_jobs=self.n_jobs, - n_trials=self.n_trials) + study.optimize(self.objective, n_jobs=self.n_jobs, n_trials=self.n_trials) except KeyboardInterrupt: pass From 0cafc7192dbdfac5e70372ac17119f124dcfd0cd Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:11:14 +0200 Subject: [PATCH 08/14] Add test for training with multiple workers and the new --total-n-trials argument. --- tests/test_train.py | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_train.py b/tests/test_train.py index ae0f5eda2..fabd88649 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,7 +1,9 @@ import os import subprocess +import optuna import pytest +from optuna.trial import TrialState def _assert_eq(left, right): @@ -101,3 +103,47 @@ def test_parallel_train(tmp_path): return_code = subprocess.call(["python", "train.py"] + args) _assert_eq(return_code, 0) + + +def test_multiple_workers(tmp_path): + study_name = "test-study" + storage = f"sqlite:///{tmp_path}/optuna.db" + n_trials = 6 + args = [ + "-optimize", + "--no-optim-plots", + "--storage", + storage, + "--total-n-trials", + str(n_trials), + "--study-name", + study_name, + "--n-evaluations", + str(1), + "-n", + str(100), + "--algo", + "ppo", + "--env", + "Pendulum-v1", + "--log-folder", + tmp_path, + ] + + p1 = subprocess.Popen(["python", "train.py"] + args) + p2 = subprocess.Popen(["python", "train.py"] + args) + p3 = subprocess.Popen(["python", "train.py"] + args) + p4 = subprocess.Popen(["python", "train.py"] + args) + + return_code1 = p1.wait() + return_code2 = p2.wait() + return_code3 = p3.wait() + return_code4 = p4.wait() + + study = optuna.load_study(study_name=study_name, storage=storage) + assert sum(t.state == TrialState.COMPLETE for t in study.trials) == n_trials + + assert return_code1 == 0 + assert return_code2 == 0 + assert return_code3 == 0 + assert return_code4 == 0 From 6110f3c31ef2b19e8cdab26ed40a48aec536a2da Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:11:38 +0200 Subject: [PATCH 09/14] Ensure Pruned trials are counted and that no optimization is started when enough trials 
are already present. --- utils/exp_manager.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 4dbeba261..0002a9d56 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -752,11 +752,21 @@ def hyperparameters_optimization(self) -> None: try: if self.total_n_trials is not None: - study.optimize( - self.objective, - n_jobs=self.n_jobs, - callbacks=[MaxTrialsCallback(self.total_n_trials, states=[TrialState.COMPLETE, TrialState.RUNNING])], - ) + counted_states = [ + TrialState.COMPLETE, + TrialState.RUNNING, + TrialState.PRUNED, + ] + completed_trials = sum(t.state in counted_states for t in study.trials) + if completed_trials < self.total_n_trials: + study.optimize( + self.objective, + n_jobs=self.n_jobs, + callbacks=[MaxTrialsCallback( + self.total_n_trials, + states=counted_states, + )], + ) else: study.optimize(self.objective, n_jobs=self.n_jobs, n_trials=self.n_trials) except KeyboardInterrupt: From 069556e36ef4f6235a96bdf26c056d6cd67b154e Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:19:50 +0200 Subject: [PATCH 10/14] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc7c863ea..71ea3442d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### New Features - Support setting PyTorch's device with thye `--device` flag (@gregwar) +- Add `--tital-n-trials` parameter to help with distributed optimization. (@ernestum) ### Bug fixes - Fix `Reacher-v3` name in PPO hyperparameter file From 7a195bc78f56aa57d3697681bd9f407a010be066 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:28:35 +0200 Subject: [PATCH 11/14] Ensure that pruned trials are understood as completed trials. --- tests/test_train.py | 2 +- train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_train.py b/tests/test_train.py index fabd88649..d2b87d4cc 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -141,7 +141,7 @@ def test_multiple_workers(tmp_path): return_code4 = p4.wait() study = optuna.load_study(study_name=study_name, storage=storage) - assert sum(t.state == TrialState.COMPLETE for t in study.trials) == n_trials + assert sum(t.state in (TrialState.COMPLETE, TrialState.PRUNED) for t in study.trials) == n_trials assert return_code1 == 0 assert return_code2 == 0 diff --git a/train.py b/train.py index d55eac042..191d01004 100644 --- a/train.py +++ b/train.py @@ -66,7 +66,7 @@ ) parser.add_argument( "--total-n-trials", - help="Number of trials for optimizing hyperparameters. " + help="Number of (potentially pruned) trials for optimizing hyperparameters. " "This applies to the entire optimization process and takes precedence over --n-trials if set.", type=int, default=None, From 645ea48db433cc657ada1d874fe8afe050b155f0 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 28 Apr 2022 14:28:44 +0200 Subject: [PATCH 12/14] Fix formatting. 
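For reference, a minimal sketch of the behaviour of the `MaxTrialsCallback` call being reformatted here, using a toy objective and a hypothetical `demo` study rather than the zoo's `ExperimentManager` (study name, storage URL and trial budget are made up for illustration): each worker process runs the same script against the shared storage, and the callback stops every worker once the study as a whole holds the requested number of completed, running or pruned trials.

    import optuna
    from optuna.study import MaxTrialsCallback
    from optuna.trial import TrialState

    def objective(trial):
        # Toy quadratic objective standing in for a real training run.
        x = trial.suggest_float("x", -10, 10)
        return -((x - 2) ** 2)

    # Every worker loads the same study from the shared storage.
    study = optuna.create_study(
        study_name="demo",
        storage="sqlite:///demo.db",
        load_if_exists=True,
        direction="maximize",
    )
    # No n_trials here: the callback stops the loop once the study-wide budget is reached.
    study.optimize(
        objective,
        callbacks=[
            MaxTrialsCallback(10, states=(TrialState.COMPLETE, TrialState.RUNNING, TrialState.PRUNED))
        ],
    )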
--- utils/exp_manager.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 0002a9d56..430fe923e 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -762,10 +762,12 @@ def hyperparameters_optimization(self) -> None: study.optimize( self.objective, n_jobs=self.n_jobs, - callbacks=[MaxTrialsCallback( - self.total_n_trials, - states=counted_states, - )], + callbacks=[ + MaxTrialsCallback( + self.total_n_trials, + states=counted_states, + ) + ], ) else: study.optimize(self.objective, n_jobs=self.n_jobs, n_trials=self.n_trials) From 374b3f9a03bba8e9f8a2787fe2c15bd9432810ab Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 29 Apr 2022 15:34:32 +0200 Subject: [PATCH 13/14] Add tuned hyperparameters for ppo and seals environments. --- hyperparams/ppo.yml | 156 ++++++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 65 deletions(-) diff --git a/hyperparams/ppo.yml b/hyperparams/ppo.yml index fc2d3e1aa..41bb70568 100644 --- a/hyperparams/ppo.yml +++ b/hyperparams/ppo.yml @@ -42,18 +42,22 @@ CartPole-v1: learning_rate: lin_0.001 clip_range: lin_0.2 +# Tuned seals/CartPole-v0: n_envs: 8 n_timesteps: !!float 1e5 policy: 'MlpPolicy' - n_steps: 32 batch_size: 256 - gae_lambda: 0.8 - gamma: 0.98 - n_epochs: 20 - ent_coef: 0.0 - learning_rate: lin_0.001 - clip_range: lin_0.2 + clip_range: 0.4 + ent_coef: 0.008508727919228772 + gae_lambda: 0.9 + gamma: 0.9999 + learning_rate: 0.0012403278189645594 + max_grad_norm: 0.8 + n_epochs: 10 + n_steps: 512 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.489343896591493 MountainCar-v0: normalize: true @@ -66,16 +70,23 @@ MountainCar-v0: n_epochs: 4 ent_coef: 0.0 +# Tuned seals/MountainCar-v0: normalize: true n_envs: 16 n_timesteps: !!float 1e6 policy: 'MlpPolicy' - n_steps: 16 + batch_size: 512 + clip_range: 0.2 + ent_coef: 6.4940755116195606e-06 gae_lambda: 0.98 gamma: 0.99 - n_epochs: 4 - ent_coef: 0.0 + learning_rate: 0.0004476103728105138 + max_grad_norm: 1 + n_epochs: 20 + n_steps: 256 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.25988158989488963 # Tuned MountainCarContinuous-v0: @@ -379,8 +390,20 @@ Ant-v3: &mujoco-defaults n_timesteps: !!float 1e6 policy: 'MlpPolicy' +# Tuned seals/Ant-v0: <<: *mujoco-defaults + batch_size: 16 + clip_range: 0.3 + ent_coef: 3.1441389214159857e-06 + gae_lambda: 0.8 + gamma: 0.995 + learning_rate: 0.00017959211641976886 + max_grad_norm: 0.9 + n_epochs: 10 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.4351450387648799 # Hopper-v3: # <<: *mujoco-defaults @@ -396,9 +419,20 @@ Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +# Tuned seals/Swimmer-v0: <<: *mujoco-defaults - gamma: 0.9999 + batch_size: 8 + clip_range: 0.1 + ent_coef: 5.167107294612664e-08 + gae_lambda: 0.95 + gamma: 0.999 + learning_rate: 0.0001214437022727675 + max_grad_norm: 2 + n_epochs: 20 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.6162112311062333 # Tuned # 10 mujoco envs @@ -425,27 +459,23 @@ HalfCheetah-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +# Tuned seals/HalfCheetah-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e6 batch_size: 64 - n_steps: 512 - gamma: 0.98 - learning_rate: 2.0633e-05 - ent_coef: 0.000401762 clip_range: 0.1 - n_epochs: 20 - 
gae_lambda: 0.92 + ent_coef: 3.794797423594763e-06 + gae_lambda: 0.95 + gamma: 0.95 + learning_rate: 0.0003286871805949382 max_grad_norm: 0.8 - vf_coef: 0.58096 - policy_kwargs: "dict( - log_std_init=-2, - ortho_init=False, - activation_fn=nn.ReLU, - net_arch=[dict(pi=[256, 256], vf=[256, 256])] - )" + n_epochs: 5 + n_steps: 512 + policy_kwargs: dict(activation_fn=nn.Tanh, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.11483689492120866 # Ant-v3: # normalize: true @@ -485,27 +515,23 @@ Hopper-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +# Tuned seals/Hopper-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e6 - batch_size: 32 - n_steps: 512 - gamma: 0.999 - learning_rate: 9.80828e-05 - ent_coef: 0.00229519 - clip_range: 0.2 - n_epochs: 5 - gae_lambda: 0.99 - max_grad_norm: 0.7 - vf_coef: 0.835671 - policy_kwargs: "dict( - log_std_init=-2, - ortho_init=False, - activation_fn=nn.ReLU, - net_arch=[dict(pi=[256, 256], vf=[256, 256])] - )" + batch_size: 512 + clip_range: 0.1 + ent_coef: 0.0010159833764878474 + gae_lambda: 0.98 + gamma: 0.995 + learning_rate: 0.0003904770450788824 + max_grad_norm: 0.9 + n_epochs: 20 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[64, 64], vf=[64, 64])]) + vf_coef: 0.20315938606555833 HumanoidStandup-v3: normalize: true @@ -551,27 +577,24 @@ Humanoid-v3: net_arch=[dict(pi=[256, 256], vf=[256, 256])] )" +# Tuned seals/Humanoid-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e7 batch_size: 256 - n_steps: 512 - gamma: 0.95 - learning_rate: 3.56987e-05 - ent_coef: 0.00238306 - clip_range: 0.3 - n_epochs: 5 - gae_lambda: 0.9 - max_grad_norm: 2 - vf_coef: 0.431892 - policy_kwargs: "dict( - log_std_init=-2, - ortho_init=False, - activation_fn=nn.ReLU, - net_arch=[dict(pi=[256, 256], vf=[256, 256])] - )" + clip_range: 0.2 + ent_coef: 2.0745206045994986e-05 + gae_lambda: 0.92 + gamma: 0.999 + learning_rate: 2.0309225666232827e-05 + max_grad_norm: 0.5 + n_epochs: 20 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, + 256])]) + vf_coef: 0.819262464558427 InvertedDoublePendulum-v3: normalize: true @@ -659,18 +682,21 @@ Walker2d-v3: max_grad_norm: 1 vf_coef: 0.871923 +# Tuned seals/Walker2d-v0: normalize: true n_envs: 1 policy: 'MlpPolicy' n_timesteps: !!float 1e6 - batch_size: 32 - n_steps: 512 - gamma: 0.99 - learning_rate: 5.05041e-05 - ent_coef: 0.000585045 - clip_range: 0.1 - n_epochs: 20 - gae_lambda: 0.95 - max_grad_norm: 1 - vf_coef: 0.871923 + batch_size: 8 + clip_range: 0.4 + ent_coef: 0.00013057334805552262 + gae_lambda: 0.92 + gamma: 0.98 + learning_rate: 3.791707778339674e-05 + max_grad_norm: 0.6 + n_epochs: 5 + n_steps: 2048 + policy_kwargs: dict(activation_fn=nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, + 256])]) + vf_coef: 0.6167177795726859 From ea16e61b7da8d0eee2b97049fff9a6254b765b10 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 29 Apr 2022 16:05:51 +0200 Subject: [PATCH 14/14] Add tuned hyperparameters for SAC and seals environments. 
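As a usage note, the new `seals/HalfCheetah-v0` entry below corresponds roughly to the following standalone stable-baselines3 call. This is only a sketch: the zoo's `ExperimentManager` builds the model from the YAML itself (including wrappers and vec-env handling), and the snippet assumes `gym`, `seals` and a working MuJoCo install.

    import gym
    import seals  # noqa: F401  (registers the seals/* environment ids)
    from stable_baselines3 import SAC

    # Hyperparameters taken from the seals/HalfCheetah-v0 entry added in this patch.
    model = SAC(
        "MlpPolicy",
        gym.make("seals/HalfCheetah-v0"),
        batch_size=2048,
        buffer_size=100_000,
        gamma=0.95,
        learning_rate=0.000884624878315995,
        learning_starts=10000,
        policy_kwargs=dict(net_arch=[64, 64], log_std_init=-0.6932709443503001),
        tau=0.01,
        train_freq=64,
    )
    model.learn(total_timesteps=int(1e6))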
--- hyperparams/sac.yml | 55 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index c0af74cd8..e9d2469f0 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -195,42 +195,95 @@ HalfCheetah-v3: &mujoco-defaults policy: 'MlpPolicy' learning_starts: 10000 +# Tuned seals/HalfCheetah-v0: <<: *mujoco-defaults + batch_size: 2048 + buffer_size: 100000 + gamma: 0.95 + learning_rate: 0.000884624878315995 + learning_starts: 10000 + policy_kwargs: dict(net_arch=[64, 64], log_std_init=-0.6932709443503001) + tau: 0.01 + train_freq: 64 Ant-v3: <<: *mujoco-defaults +# Tuned seals/Ant-v0: <<: *mujoco-defaults + batch_size: 512 + buffer_size: 1000000 + gamma: 0.98 + learning_rate: 0.0018514039303149058 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[256, 256], log_std_init=-2.2692589009754176) + tau: 0.05 + train_freq: 64 Hopper-v3: <<: *mujoco-defaults +# Tuned seals/Hopper-v0: <<: *mujoco-defaults + batch_size: 128 + buffer_size: 100000 + gamma: 0.98 + learning_rate: 0.001709807687567946 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[256, 256], log_std_init=-1.6829391077276037) + tau: 0.08 + train_freq: 32 Walker2d-v3: <<: *mujoco-defaults +# Tuned seals/Walker2d-v0: <<: *mujoco-defaults + batch_size: 128 + buffer_size: 100000 + gamma: 0.99 + learning_rate: 0.0005845844772048097 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[400, 300], log_std_init=0.1955317469998743) + tau: 0.02 + train_freq: 1 Humanoid-v3: <<: *mujoco-defaults n_timesteps: !!float 2e6 +# Tuned seals/Humanoid-v0: <<: *mujoco-defaults n_timesteps: !!float 2e6 + batch_size: 64 + buffer_size: 100000 + gamma: 0.98 + learning_rate: 4.426351861707874e-05 + learning_starts: 20000 + policy_kwargs: dict(net_arch=[400, 300], log_std_init=-0.1034412732183072) + tau: 0.08 + train_freq: 8 Swimmer-v3: <<: *mujoco-defaults gamma: 0.9999 +# Tuned seals/Swimmer-v0: <<: *mujoco-defaults - gamma: 0.9999 + batch_size: 128 + buffer_size: 100000 + gamma: 0.995 + learning_rate: 0.00039981805535514633 + learning_starts: 1000 + policy_kwargs: dict(net_arch=[400, 300], log_std_init=-2.689958330139309) + tau: 0.01 + train_freq: 256 # === HER Robotics GoalEnvs ===
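Putting the series together, the intended workflow is to point several train.py workers at the same Optuna storage and let `--total-n-trials` cap the combined number of completed or pruned trials. Below is a minimal launcher sketch mirroring tests/test_train.py; the storage path, study name and budget are illustrative only.

    import subprocess

    args = [
        "python", "train.py", "-optimize", "--no-optim-plots",
        "--algo", "ppo", "--env", "Pendulum-v1",
        "--storage", "sqlite:///optuna.db", "--study-name", "demo-study",
        "--total-n-trials", "6", "--n-evaluations", "1", "-n", "100",
    ]
    # Four workers share one study; each exits once the study holds 6 counted trials.
    workers = [subprocess.Popen(args) for _ in range(4)]
    assert all(p.wait() == 0 for p in workers)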