Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update agilerl tutorials #1238

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 119 additions & 123 deletions docs/tutorials/agilerl/DQN.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tutorials/agilerl/MADDPG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ To follow this tutorial, you will need to install the dependencies shown below.
```

## Code
### Train multiple agents using MADDPG
### Train agents using MADDPG
The following code should run without any issues. The comments are designed to help you understand how to use PettingZoo with AgileRL. If you have any questions, please feel free to ask in the [Discord server](https://discord.com/invite/eB8HyTA2ux).

```{eval-rst}
Expand Down
269 changes: 141 additions & 128 deletions tutorials/AgileRL/agilerl_dqn_curriculum.py

Large diffs are not rendered by default.

327 changes: 163 additions & 164 deletions tutorials/AgileRL/agilerl_maddpg.py

Large diffs are not rendered by default.

220 changes: 133 additions & 87 deletions tutorials/AgileRL/agilerl_matd3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,28 @@

Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a)
"""

import os

import numpy as np
import torch
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from agilerl.utils.utils import create_population
from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv
from tqdm import trange

from pettingzoo.mpe import simple_speaker_listener_v4

if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("===== AgileRL MATD3 Demo =====")
print("===== AgileRL Online Multi-Agent Demo =====")

# Define the network configuration
NET_CONFIG = {
"arch": "mlp", # Network architecture
"h_size": [32, 32], # Actor hidden size
"hidden_size": [32, 32], # Actor hidden size
}

# Define the initial hyperparameters
Expand All @@ -31,36 +33,47 @@
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": False,
"BATCH_SIZE": 32, # Batch size
"O_U_NOISE": True, # Ornstein-Uhlenbeck action noise
"EXPL_NOISE": 0.1, # Action noise scale
"MEAN_NOISE": 0.0, # Mean action noise
"THETA": 0.15, # Rate of mean reversion in OU noise
"DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
"LR_CRITIC": 0.01, # Critic learning rate
"LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
"MEMORY_SIZE": 100000, # Max memory buffer size
"LEARN_STEP": 5, # Learning frequency
"LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
"POLICY_FREQ": 2, # Policy frequency
}

num_envs = 8
# Define the simple speaker listener environment as a parallel environment
env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)])
env.reset()

# Configure the multi-agent algo input arguments
try:
state_dim = [env.observation_space(agent).n for agent in env.agents]
state_dim = [env.single_observation_space(agent).n for agent in env.agents]
one_hot = True
except Exception:
state_dim = [env.observation_space(agent).shape for agent in env.agents]
state_dim = [env.single_observation_space(agent).shape for agent in env.agents]
one_hot = False
try:
action_dim = [env.action_space(agent).n for agent in env.agents]
action_dim = [env.single_action_space(agent).n for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = True
INIT_HP["MAX_ACTION"] = None
INIT_HP["MIN_ACTION"] = None
except Exception:
action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = False
INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]
INIT_HP["MAX_ACTION"] = [
env.single_action_space(agent).high for agent in env.agents
]
INIT_HP["MIN_ACTION"] = [
env.single_action_space(agent).low for agent in env.agents
]

# Not applicable to MPE environments; only needed when observations are images (e.g. Atari environments)
if INIT_HP["CHANNELS_LAST"]:
Expand All @@ -73,14 +86,15 @@
INIT_HP["AGENT_IDS"] = env.agents

# Create a population ready for evolutionary hyper-parameter optimisation
pop = initialPopulation(
pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
one_hot,
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
num_envs=num_envs,
device=device,
)

Expand All @@ -98,8 +112,8 @@
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
evo_step=1,
) # Evaluate using last N fitness scores
eval_loop=1, # Evaluate using last N fitness scores
)

# Instantiate a mutations object (used for HPO)
mutations = Mutations(
Expand All @@ -123,116 +137,148 @@
)

# Define training loop parameters
max_episodes = 500 # Total episodes (default: 6000)
max_steps = 25 # Maximum steps to take in each episode
epsilon = 1.0 # Starting epsilon value
eps_end = 0.1 # Final epsilon value
eps_decay = 0.995 # Epsilon decay
evo_epochs = 20 # Evolution frequency
evo_loop = 1 # Number of evaluation episodes
max_steps = 13000 # Max steps (default: 2000000)
learning_delay = 0 # Steps before starting learning
evo_steps = 1000 # Evolution frequency
eval_steps = None # Evaluation steps per episode - go until done
eval_loop = 1 # Number of evaluation episodes
elite = pop[0] # Assign a placeholder "elite" agent

# Training loop
for idx_epi in trange(max_episodes):
total_steps = 0

# TRAINING LOOP
print("Training...")
pbar = trange(max_steps, unit="step")
while np.less([agent.steps[-1] for agent in pop], max_steps).all():
pop_episode_scores = []
for agent in pop: # Loop through population
state, info = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
scores = np.zeros(num_envs)
completed_episode_scores = []
steps = 0
if INIT_HP["CHANNELS_LAST"]:
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
agent_id: np.moveaxis(s, [-1], [-3])
for agent_id, s in state.items()
}

for _ in range(max_steps):
agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
env_defined_actions = (
info["env_defined_actions"]
if "env_defined_actions" in info.keys()
else None
)

for idx_step in range(evo_steps // num_envs):
# Get next action from agent
cont_actions, discrete_action = agent.getAction(
state, epsilon, agent_mask, env_defined_actions
cont_actions, discrete_action = agent.get_action(
states=state, training=True, infos=info
)
if agent.discrete_actions:
action = discrete_action
else:
action = cont_actions

next_state, reward, termination, truncation, info = env.step(
action
) # Act in environment
# Act in environment
next_state, reward, termination, truncation, info = env.step(action)

scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
total_steps += num_envs
steps += num_envs

# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [-1], [-3])
for agent_id, ns in next_state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, cont_actions, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r
memory.save_to_memory(
state,
cont_actions,
reward,
next_state,
termination,
is_vectorised=True,
)

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
# Handle learn steps > num_envs
if agent.learn_step > num_envs:
learn_step = agent.learn_step // num_envs
if (
idx_step % learn_step == 0
and len(memory) >= agent.batch_size
and memory.counter > learning_delay
):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
# Handle num_envs > learn step; learn multiple times per step in env
elif (
len(memory) >= agent.batch_size and memory.counter > learning_delay
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm
for _ in range(num_envs // agent.learn_step):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
)
for agent in pop
]
# Calculate scores and reset noise for finished episodes
reset_noise_indices = []
term_array = np.array(list(termination.values())).transpose()
trunc_array = np.array(list(truncation.values())).transpose()
for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
if np.any(d) or np.any(t):
completed_episode_scores.append(scores[idx])
agent.scores.append(scores[idx])
scores[idx] = 0
reset_noise_indices.append(idx)
agent.reset_action_noise(reset_noise_indices)

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
pbar.update(evo_steps // len(pop))

agent.steps[-1] += steps
pop_episode_scores.append(completed_episode_scores)

# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=eval_steps,
loop=eval_loop,
)
for agent in pop
]
mean_scores = [
(
np.mean(episode_scores)
if len(episode_scores) > 0
else "0 completed episodes"
)
for episode_scores in pop_episode_scores
]

print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
agent.steps.append(agent.steps[-1])

# Save the trained algorithm
path = "./models/MATD3"
filename = "MATD3_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)
elite.save_checkpoint(save_path)

pbar.close()
env.close()
Loading