
Commit 6729638

Add meta-gradient LPG tricks (multiple critic updates, advantage normalization)
1 parent f89dcbd commit 6729638

4 files changed: 54 additions, 21 deletions

README.md

Lines changed: 27 additions & 9 deletions
@@ -57,22 +57,39 @@ echo [KEY] > setup/wandb_key
 
 # Running experiments
 Meta-training is executed with `python3.8 train.py`, with all arguments found in [`experiments/parse_args.py`](https://github.com/EmptyJackson/groove/blob/main/experiments/parse_args.py).
-* `--log --wandb_entity [entity] --wandb_project [project]` enables logging to WandB.
-* `--num_agents [agents]` sets the meta-training batch size.
-* `--num_mini_batches [mini_batches]` computes each update in sequential mini-batches, in order to execute large batches with little memory. *RECOMMENDED: lower this to the smallest value that fits in memory.*
-* `--debug` disables JIT compilation.
+| Argument | Description |
+| --- | --- |
+| `--env_mode [env_mode]` | Sets the environment mode (below). |
+| `--num_agents [agents]` | Sets the meta-training batch size. |
+| `--num_mini_batches [mini_batches]` | Computes each update in sequential mini-batches, in order to execute large batches with little memory. *RECOMMENDED: lower this to the smallest value that fits in memory.* |
+| `--debug` | Disables JIT compilation. |
+| `--log --wandb_entity [entity] --wandb_project [project]` | Enables logging to WandB. |
+
+
+### Grid-World environments
+
+| Environment mode | Description | Lifetime (# of updates) |
+| --- | --- | --- |
+| `tabular` | Five tabular levels from [LPG](https://arxiv.org/abs/2007.08794) | Variable |
+| `mazes` | Maze levels from [MiniMax](https://github.com/facebookresearch/minimax) | 2500 |
+| `all_shortlife` | Uniformly sampled levels | 250 |
+| `all_vrandlife` | Uniformly sampled levels | 10-250 (log-sampled) |
+
+
+### Examples
+| Experiment | Command |
+| --- | --- |
+| LPG (meta-gradient) | `python3.8 train.py --num_agents 512 --num_mini_batches 16 --log --wandb_entity [entity] --wandb_project [project]` |
+| GROOVE | LPG with `--score_function alg_regret` (algorithmic regret is computed every step due to end-to-end compilation, so currently very inefficient) |
+| TA-LPG | LPG with `--num_mini_batches 8 --use_es --lifetime_conditioning --lpg_learning_rate 0.01 --env_mode all_vrandlife` |
+
 
 ### Docker
 To execute CPU or GPU docker containers, run the relevant script (with the GPU index as the first argument for the GPU script).
 ```
 ./run_gpu.sh [GPU id] python3.8 train.py [args]
 ```
 
-### Examples
-* LPG: `python3.8 train.py --num_agents 512 --num_mini_batches 16 --log --wandb_entity [entity] --wandb_project [project]`
-* GROOVE: LPG with `--score_function alg_regret`
-* TA-LPG: LPG with `--num_mini_batches 8 --use_es --lifetime_conditioning --lpg_learning_rate 0.01`
-
 # Citation
 If you use this implementation in your work, please cite us with the following:
 ```
@@ -96,5 +113,6 @@ If you use this implementation in your work, please cite us with the following:
 
 # Coming soon
 
+* Speed up GROOVE by removing recomputation of algorithmic regret every step.
 * Meta-testing script for checkpointed models.
 * Alternative UED metrics (PVL, MaxMC).
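
The `--num_mini_batches` flag described in the argument table above amounts to accumulating the meta-update over sequential chunks of the agent batch, which is why lowering it to the smallest value that fits in memory is recommended. Below is a minimal sketch of that pattern in JAX; the names (`meta_loss`, `accumulated_grad`, `agent_batch`) and the stand-in loss are hypothetical and not the repository's actual implementation.

```python
# Sketch of sequential mini-batch gradient accumulation, assuming a hypothetical
# per-agent objective; not the repository's meta-update code.
import jax
import jax.numpy as jnp


def meta_loss(lpg_params, agent_batch):
    # Stand-in loss: any per-agent objective averaged over the mini-batch.
    return jnp.mean((agent_batch @ lpg_params) ** 2)


def accumulated_grad(lpg_params, agent_batch, num_mini_batches):
    # Split the agent batch along its leading axis and scan over the chunks,
    # so only one mini-batch is materialised in memory at a time.
    chunks = agent_batch.reshape(num_mini_batches, -1, *agent_batch.shape[1:])

    def _one_chunk(grad_sum, chunk):
        grad = jax.grad(meta_loss)(lpg_params, chunk)
        return jax.tree_util.tree_map(jnp.add, grad_sum, grad), None

    zero = jax.tree_util.tree_map(jnp.zeros_like, lpg_params)
    grad_sum, _ = jax.lax.scan(_one_chunk, zero, chunks)
    # Average over mini-batches to recover the full-batch gradient.
    return jax.tree_util.tree_map(lambda g: g / num_mini_batches, grad_sum)


params = jnp.ones(4)
batch = jnp.arange(512.0 * 4).reshape(512, 4)
print(accumulated_grad(params, batch, num_mini_batches=16).shape)  # (4,)
```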

agents/lpg_agent.py

Lines changed: 3 additions & 3 deletions
@@ -126,14 +126,14 @@ def _train_step(carry, _):
         metrics = LPGAgentMetrics(
             pi_l2, actor_entropy, critic_loss, y_l2, critic_entropy
         )
-        return (rng, agent_state), metrics
+        return (rng, agent_state), (rollout, metrics)
 
     # --- Perform K agent updates ---
-    carry_out, metrics = jax.lax.scan(
+    carry_out, (rollout, metrics) = jax.lax.scan(
         _train_step,
         (rng, agent_state),
         None,
         length=num_train_steps,
     )
     _, agent_state = carry_out
-    return agent_state, jax.tree_map(jnp.mean, metrics)
+    return agent_state, rollout, jax.tree_map(jnp.mean, metrics)
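
This change relies on a standard property of `jax.lax.scan`: each step's second return value is stacked along a new leading axis, so returning the rollout from `_train_step` makes all K training rollouts available to the caller at once. A self-contained sketch of that behaviour, with a hypothetical step function and stand-in data rather than the repository's agent code:

```python
# Minimal sketch of lax.scan stacking per-step outputs; the step function,
# shapes, and "rollout" here are hypothetical stand-ins.
import jax
import jax.numpy as jnp


def _step(carry, _):
    rng, state = carry
    rng, _rng = jax.random.split(rng)
    fake_rollout = jax.random.normal(_rng, (8,))   # stand-in for a rollout
    fake_metric = jnp.mean(fake_rollout)           # stand-in for metrics
    return (rng, state + 1), (fake_rollout, fake_metric)


carry_out, (rollouts, metrics) = jax.lax.scan(
    _step, (jax.random.PRNGKey(0), jnp.float32(0.0)), None, length=5
)
print(rollouts.shape, metrics.shape)  # (5, 8) and (5,): one entry per update
```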

experiments/parse_args.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ def parse_args(cmd_args=sys.argv[1:]):
         "--env_name", help="Environment name", type=str, default="GridWorld-v0"
     )
     parser.add_argument(
-        "--env_mode", help="Environment mode", type=str, default="all_vrandlife"
+        "--env_mode", help="Environment mode", type=str, default="all_shortlife"
     )
     parser.add_argument(
         "--env_workers",

meta/train.py

Lines changed: 23 additions & 8 deletions
@@ -39,7 +39,9 @@ def _train_agent(lpg_params, rng, agent_state, value_critic_state):
 
     # --- Perform K agent train steps ---
     rng, _rng = jax.random.split(rng)
-    agent_state, agent_metrics = agent_train_fn(_rng, _lpg_train_state, agent_state)
+    agent_state, rollouts, agent_metrics = agent_train_fn(
+        _rng, _lpg_train_state, agent_state
+    )
 
     # --- Rollout updated agent ---
     rng, _rng = jax.random.split(rng)
@@ -56,19 +58,32 @@ def _train_agent(lpg_params, rng, agent_state, value_critic_state):
     )
 
     # --- Update value function ---
-    def _compute_value_loss(critic_params):
+    def _compute_value_loss(critic_params, rollouts):
         value_critic_state.replace(params=critic_params)
         value_loss, adv = jax.vmap(
             compute_advantage, in_axes=(None, 0, None, None)
-        )(value_critic_state, eval_rollouts, gamma, gae_lambda)
+        )(value_critic_state, rollouts, gamma, gae_lambda)
         return value_loss.mean(), adv
 
-    (value_loss, adv), value_critic_grad = jax.value_and_grad(
-        _compute_value_loss, has_aux=True
-    )(value_critic_state.params)
-    value_critic_state = value_critic_state.apply_gradients(grads=value_critic_grad)
+    def _update_critic(value_critic_state, rollouts):
+        losses, value_critic_grad = jax.value_and_grad(
+            _compute_value_loss, has_aux=True
+        )(value_critic_state.params, rollouts)
+        return value_critic_state.apply_gradients(grads=value_critic_grad), losses
+
+    # Iteratively update on train rollouts
+    value_critic_state, _ = jax.lax.scan(
+        _update_critic, value_critic_state, rollouts
+    )
+    # Update critic on evaluation rollout
+    value_critic_state, (value_loss, adv) = _update_critic(
+        value_critic_state, eval_rollouts
+    )
 
     # --- Compute regularized LPG loss ---
+    # Normalize advantage across batch
+    adv = jnp.divide(jnp.subtract(adv, jnp.mean(adv)), jnp.std(adv) + 1e-8)
+
     def _compute_lpg_loss(rollout, adv):
         actor = agent_state.actor_state
         action_probs = actor.apply_fn({"params": actor.params}, rollout.obs)
@@ -157,7 +172,7 @@ def _compute_candidate_fitness(rng, candidate_params, agent_state):
     rng, _rng = jax.random.split(rng)
 
     # --- Train an agent using LPG with candidate parameters ---
-    agent_state, metrics = agent_train_fn(
+    agent_state, _, metrics = agent_train_fn(
         rng=_rng,
         lpg_train_state=candidate_train_state,
         agent_state=agent_state,
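
Taken together, the `meta/train.py` changes above implement the two tricks named in the commit message: the value critic is updated once per stacked training rollout via `jax.lax.scan` before a final update on the evaluation rollout, and the resulting advantages are normalized to zero mean and roughly unit variance before entering the LPG loss. A condensed sketch of both tricks, using hypothetical stand-ins (a linear critic and synthetic rollouts in place of `value_critic_state` and `compute_advantage`):

```python
# Sketch of "multiple critic updates + advantage normalization", assuming a
# toy linear critic and synthetic rollouts; not the repository's code.
import jax
import jax.numpy as jnp


def critic_loss(params, rollout):
    # Stand-in value loss: squared error of a linear critic against returns.
    obs, returns = rollout
    values = obs @ params
    adv = returns - values
    return jnp.mean(adv ** 2), adv


def update_critic(params, rollout, lr=1e-2):
    # One gradient step on a single rollout; also returns the advantages.
    (loss, adv), grad = jax.value_and_grad(critic_loss, has_aux=True)(params, rollout)
    return params - lr * grad, (loss, adv)


def critic_updates_then_normalized_adv(params, train_rollouts, eval_rollout):
    # Trick 1: one critic update per stacked training rollout (leading axis K),
    # followed by a final update on the evaluation rollout.
    params, _ = jax.lax.scan(update_critic, params, train_rollouts)
    params, (_, adv) = update_critic(params, eval_rollout)
    # Trick 2: normalize advantages before they feed the LPG loss.
    adv = (adv - jnp.mean(adv)) / (jnp.std(adv) + 1e-8)
    return params, adv


K, T, D = 4, 16, 8
obs = jax.random.normal(jax.random.PRNGKey(0), (K, T, D))
returns = jnp.ones((K, T))
params, adv = critic_updates_then_normalized_adv(
    jnp.zeros(D), (obs, returns), (obs[0], returns[0])
)
print(adv.shape)  # (16,), zero mean and approximately unit std
```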
