Commit

Switch from gym to gymnasium and refresh RL code (#2864)
* Switch from gym to gymnasium and refresh RL code

* autopep8 fix

---------

Co-authored-by: vfdev-5 <[email protected]>
vfdev-5 authored Feb 15, 2023
1 parent 913b60f commit 0b2e1e3
Showing 4 changed files with 148 additions and 92 deletions.
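For readers following the diff below, the heart of the change is the gymnasium API: `env.reset()` takes a `seed` argument and returns an `(observation, info)` pair, and `env.step()` returns `(observation, reward, terminated, truncated, info)`. The snippet below is a minimal sketch of that interaction loop, assuming the `CartPole-v1` task the example networks are sized for; it is illustrative only and not part of the commit.

```python
# Minimal gymnasium interaction loop (illustrative sketch, not part of this commit).
# Assumes CartPole-v1, the task the example networks (4 observations, 2 actions) are built for.
import gymnasium as gym

env = gym.make("CartPole-v1")
observation, info = env.reset(seed=543)  # reset() returns (observation, info)
episode_over = False
while not episode_over:
    action = env.action_space.sample()  # random policy, just to exercise the API
    observation, reward, terminated, truncated, info = env.step(action)
    episode_over = terminated or truncated
env.close()
```

The refreshed scripts unpack the same five return values from `env.step` and pass `seed=args.seed + trainer.state.epoch` to `env.reset`, as shown in the handlers below.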
2 changes: 1 addition & 1 deletion examples/reinforcement_learning/README.md
@@ -3,7 +3,7 @@
ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/reinforcement_learning)

```bash
pip install gym
pip install gymnasium
# For REINFORCE:
python reinforce.py
# For actor critic:
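A quick way to confirm the dependency change in the README is a short import check; the snippet below is a suggested sanity check, not code from the repository.

```python
# Quick sanity check after `pip install gymnasium` (suggested, not part of the examples).
import gymnasium as gym

print(gym.__version__)
env = gym.make("CartPole-v1")
print(env.spec.reward_threshold)  # the training scripts stop once the running reward exceeds this
env.close()
```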
149 changes: 102 additions & 47 deletions examples/reinforcement_learning/actor_critic.py
@@ -1,5 +1,5 @@
import argparse
from collections import namedtuple
from collections import deque, namedtuple

import numpy as np
import torch
@@ -11,61 +11,110 @@
from ignite.engine import Engine, Events

try:
import gym
import gymnasium as gym
except ImportError:
raise ModuleNotFoundError("Please install opengym: pip install gym")
raise ModuleNotFoundError("Please install gymnasium: pip install gymnasium")


SavedAction = namedtuple("SavedAction", ["log_prob", "value"])

eps = np.finfo(np.float32).eps.item()


class Policy(nn.Module):
"""
implements both actor and critic in one model
"""

def __init__(self):
super(Policy, self).__init__()
self.affine1 = nn.Linear(4, 128)

# actor's layer
self.action_head = nn.Linear(128, 2)

# critic's layer
self.value_head = nn.Linear(128, 1)

# action & reward buffer
self.saved_actions = []
self.rewards = []

def forward(self, x):
"""
forward of both actor and critic
"""
x = F.relu(self.affine1(x))
action_scores = self.action_head(x)

# actor: chooses action to take from state s_t
# by returning probability of each action
action_prob = F.softmax(self.action_head(x), dim=-1)

# critic: evaluates being in the state s_t
state_values = self.value_head(x)
return F.softmax(action_scores, dim=-1), state_values

# return values for both actor and critic as a tuple of 2 values:
# 1. a list with the probability of each action over the action space
# 2. the value from state s_t
return action_prob, state_values


def select_action(model, observation):
def select_action(policy, observation):
observation = torch.from_numpy(observation).float()
probs, observation_value = model(observation)
probs, observation_value = policy(observation)
# create a categorical distribution over the list of probabilities of actions
m = Categorical(probs)

# and sample an action using the distribution
action = m.sample()
model.saved_actions.append(SavedAction(m.log_prob(action), observation_value))

# save to action buffer
policy.saved_actions.append(SavedAction(m.log_prob(action), observation_value))

# the action to take (left or right)
return action.item()


def finish_episode(model, optimizer, gamma, eps):
def finish_episode(policy, optimizer, gamma):
"""
Training code. Calculates actor and critic loss and performs backprop.
"""
R = 0
saved_actions = model.saved_actions
policy_losses = []
value_losses = []
rewards = []
for r in model.rewards[::-1]:
saved_actions = policy.saved_actions
policy_losses = [] # list to save actor (policy) loss
value_losses = [] # list to save critic (value) loss
returns = deque() # list to save the true values

# calculate the true value using rewards returned from the environment
for r in policy.rewards[::-1]:
# calculate the discounted value
R = r + gamma * R
rewards.insert(0, R)
rewards = torch.tensor(rewards)
rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
for (log_prob, value), r in zip(saved_actions, rewards):
reward = r - value.item()
policy_losses.append(-log_prob * reward)
value_losses.append(F.smooth_l1_loss(value, torch.tensor([r])))
returns.appendleft(R)

returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + eps)

for (log_prob, value), R in zip(saved_actions, returns):
advantage = R - value.item()

# calculate actor (policy) loss
policy_losses.append(-log_prob * advantage)

# calculate critic (value) loss using L1 smooth loss
value_losses.append(F.smooth_l1_loss(value, torch.tensor([R])))

# reset gradients
optimizer.zero_grad()

# sum up all the values of policy_losses and value_losses
loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

# perform backprop
loss.backward()
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]
# reset rewards and action buffer
del policy.rewards[:]
del policy.saved_actions[:]


EPISODE_STARTED = Events.EPOCH_STARTED
@@ -74,57 +123,63 @@ def finish_episode(model, optimizer, gamma, eps):

def main(env, args):

model = Policy()
optimizer = optim.Adam(model.parameters(), lr=3e-2)
eps = np.finfo(np.float32).eps.item()
timesteps = list(range(10000))
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=3e-2)
timesteps = range(10000)

def run_single_timestep(engine, timestep):
observation = engine.state.observation
action = select_action(model, observation)
# select action from policy
action = select_action(policy, observation)

# take the action
engine.state.observation, reward, done, _, _ = env.step(action)

if args.render:
env.render()
model.rewards.append(reward)

policy.rewards.append(reward)
engine.state.ep_reward += reward
if done:
engine.terminate_epoch()
engine.state.timestep = timestep

trainer = Engine(run_single_timestep)

@trainer.on(Events.STARTED)
def initialize(engine):
engine.state.running_reward = 10
trainer.state.running_reward = 10

@trainer.on(EPISODE_STARTED)
def reset_environment_state(engine):
def reset_environment_state():
# reset environment and episode reward
torch.manual_seed(args.seed + trainer.state.epoch)
engine.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
trainer.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
trainer.state.ep_reward = 0

@trainer.on(EPISODE_COMPLETED)
def update_model(engine):
t = engine.state.timestep
engine.state.running_reward = engine.state.running_reward * 0.99 + t * 0.01
finish_episode(model, optimizer, args.gamma, eps)
def update_model():
# update cumulative reward
t = trainer.state.timestep
trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward
# perform backprop
finish_episode(policy, optimizer, args.gamma)

@trainer.on(EPISODE_COMPLETED(every=args.log_interval))
def log_episode(engine):
i_episode = engine.state.epoch
def log_episode():
i_episode = trainer.state.epoch
print(
f"Episode {i_episode}\tLast length: {engine.state.timestep:5d}"
f"\tAverage length: {engine.state.running_reward:.2f}"
f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}"
f"\tAverage reward: {trainer.state.running_reward:.2f}"
)

@trainer.on(EPISODE_COMPLETED)
def should_finish_training(engine):
running_reward = engine.state.running_reward
def should_finish_training():
# check if we have "solved" the cart pole problem
running_reward = trainer.state.running_reward
if running_reward > env.spec.reward_threshold:
print(
f"Solved! Running reward is now {running_reward} and "
f"the last episode runs to {engine.state.timestep} time steps!"
f"the last episode runs to {trainer.state.timestep} time steps!"
)
engine.should_terminate = True
trainer.should_terminate = True

trainer.run(timesteps, max_epochs=args.max_episodes)

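The refreshed `finish_episode` above accumulates discounted returns back to front in a `deque`, normalizes them, and uses `R - value` as the advantage that scales the policy loss. The toy calculation below walks through the same arithmetic on a made-up three-step episode; the reward and critic values are invented purely for illustration.

```python
# Toy walk-through of the return/advantage arithmetic used in finish_episode.
# The rewards and critic values below are made-up numbers, not output of the example.
from collections import deque

import numpy as np
import torch

gamma = 0.99
rewards = [1.0, 1.0, 1.0]  # hypothetical per-step rewards from one short episode
values = [0.6, 0.5, 0.4]   # hypothetical critic estimates for the same steps
eps = np.finfo(np.float32).eps.item()

R = 0.0
returns = deque()
for r in rewards[::-1]:  # walk the episode backwards
    R = r + gamma * R
    returns.appendleft(R)
# returns is now deque([2.9701, 1.99, 1.0])

returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + eps)  # same normalization as the script

advantages = [ret - value for ret, value in zip(returns.tolist(), values)]
print(returns.tolist(), advantages)
```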
85 changes: 44 additions & 41 deletions examples/reinforcement_learning/reinforce.py
@@ -1,4 +1,5 @@
import argparse
from collections import deque

import numpy as np
import torch
@@ -10,52 +11,58 @@
from ignite.engine import Engine, Events

try:
import gym
import gymnasium as gym
except ImportError:
raise ModuleNotFoundError("Please install opengym: pip install gym")
raise ModuleNotFoundError("Please install gymnasium: pip install gymnasium")


eps = np.finfo(np.float32).eps.item()


class Policy(nn.Module):
def __init__(self):
super(Policy, self).__init__()
self.affine1 = nn.Linear(4, 128)
self.dropout = nn.Dropout(p=0.6)
self.affine2 = nn.Linear(128, 2)

self.saved_log_probs = []
self.rewards = []

def forward(self, x):
x = F.relu(self.affine1(x))
x = self.affine1(x)
x = self.dropout(x)
x = F.relu(x)
action_scores = self.affine2(x)
return F.softmax(action_scores, dim=1)


def select_action(model, observation):
def select_action(policy, observation):
state = torch.from_numpy(observation).float().unsqueeze(0)
probs = model(state)
probs = policy(state)
m = Categorical(probs)
action = m.sample()
model.saved_log_probs.append(m.log_prob(action))
policy.saved_log_probs.append(m.log_prob(action))
return action.item()


def finish_episode(model, optimizer, gamma, eps):
def finish_episode(policy, optimizer, gamma):
R = 0
policy_loss = []
rewards = []
for r in model.rewards[::-1]:
returns = deque()
for r in policy.rewards[::-1]:
R = r + gamma * R
rewards.insert(0, R)
rewards = torch.tensor(rewards)
rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
for log_prob, reward in zip(model.saved_log_probs, rewards):
policy_loss.append(-log_prob * reward)
returns.appendleft(R)
returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + eps)
for log_prob, R in zip(policy.saved_log_probs, returns):
policy_loss.append(-log_prob * R)
optimizer.zero_grad()
policy_loss = torch.cat(policy_loss).sum()
policy_loss.backward()
optimizer.step()
del model.rewards[:]
del model.saved_log_probs[:]
del policy.rewards[:]
del policy.saved_log_probs[:]


EPISODE_STARTED = Events.EPOCH_STARTED
@@ -64,57 +71,53 @@ def finish_episode(model, optimizer, gamma, eps):

def main(env, args):

model = Policy()
optimizer = optim.Adam(model.parameters(), lr=1e-2)
eps = np.finfo(np.float32).eps.item()
timesteps = list(range(10000))
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
timesteps = range(10000)

def run_single_timestep(engine, timestep):
observation = engine.state.observation
action = select_action(model, observation)
action = select_action(policy, observation)
engine.state.observation, reward, done, _, _ = env.step(action)
if args.render:
env.render()
model.rewards.append(reward)

policy.rewards.append(reward)
engine.state.ep_reward += reward
if done:
engine.terminate_epoch()
engine.state.timestep = timestep

trainer = Engine(run_single_timestep)

@trainer.on(Events.STARTED)
def initialize(engine):
engine.state.running_reward = 10
trainer.state.running_reward = 10

@trainer.on(EPISODE_STARTED)
def reset_environment_state(engine):
def reset_environment_state():
torch.manual_seed(args.seed + trainer.state.epoch)
engine.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
trainer.state.observation, _ = env.reset(seed=args.seed + trainer.state.epoch)
trainer.state.ep_reward = 0

@trainer.on(EPISODE_COMPLETED)
def update_model(engine):
t = engine.state.timestep
engine.state.running_reward = engine.state.running_reward * 0.99 + t * 0.01
finish_episode(model, optimizer, args.gamma, eps)
def update_model():
trainer.state.running_reward = 0.05 * trainer.state.ep_reward + (1 - 0.05) * trainer.state.running_reward
finish_episode(policy, optimizer, args.gamma)

@trainer.on(EPISODE_COMPLETED(every=args.log_interval))
def log_episode(engine):
i_episode = engine.state.epoch
def log_episode():
i_episode = trainer.state.epoch
print(
f"Episode {i_episode}\tLast length: {engine.state.timestep:5d}"
f"\tAverage length: {engine.state.running_reward:.2f}"
f"Episode {i_episode}\tLast reward: {trainer.state.ep_reward:.2f}"
f"\tAverage length: {trainer.state.running_reward:.2f}"
)

@trainer.on(EPISODE_COMPLETED)
def should_finish_training(engine):
running_reward = engine.state.running_reward
def should_finish_training():
running_reward = trainer.state.running_reward
if running_reward > env.spec.reward_threshold:
print(
f"Solved! Running reward is now {running_reward} and "
f"the last episode runs to {engine.state.timestep} time steps!"
f"the last episode runs to {trainer.state.timestep} time steps!"
)
engine.should_terminate = True
trainer.should_terminate = True

trainer.run(timesteps, max_epochs=args.max_episodes)

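Both refreshed scripts use the same Ignite idiom: one engine epoch is one episode (`EPISODE_STARTED = Events.EPOCH_STARTED`), the process function handles a single timestep, and `terminate_epoch()` ends the episode early. The sketch below strips that pattern to its skeleton with a random stand-in for the environment; the handler names and episode lengths are invented for illustration.

```python
# Skeleton of the Engine-as-episode-loop pattern used by actor_critic.py and reinforce.py.
# There is no environment here: each "episode" simply lasts a random number of timesteps.
import random

from ignite.engine import Engine, Events

EPISODE_STARTED = Events.EPOCH_STARTED
EPISODE_COMPLETED = Events.EPOCH_COMPLETED


def run_single_timestep(engine, timestep):
    # Stand-in for env.step(): end the episode once the random length is reached.
    if timestep >= engine.state.episode_length:
        engine.state.timestep = timestep
        engine.terminate_epoch()


trainer = Engine(run_single_timestep)


@trainer.on(EPISODE_STARTED)
def reset_episode():
    trainer.state.episode_length = random.randint(5, 20)


@trainer.on(EPISODE_COMPLETED(every=10))
def log_episode():
    print(f"Episode {trainer.state.epoch}\tlength: {trainer.state.timestep}")


trainer.run(range(10000), max_epochs=50)
```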
