fix: Fixed choose_action method from agent and model
RickFqt committed Aug 16, 2024
1 parent 63d565a commit 5a2bd8b
Showing 5 changed files with 95 additions and 89 deletions.
12 changes: 6 additions & 6 deletions urnai/agents/agent_base.py
@@ -28,12 +28,12 @@ def __init__(self, action_space : ActionSpaceBase,
def step(self) -> None:
...

# @abstractmethod
# def choose_action(self, action_space : ActionSpaceBase) -> ActionBase:
# """
# Method that contains the agent's strategy for choosing actions
# """
# ...
@abstractmethod
def choose_action(self, action_space : ActionSpaceBase) -> int:
"""
Method that contains the agent's strategy for choosing actions
"""
...

def reset(self, episode=0) -> None:
"""
54 changes: 4 additions & 50 deletions urnai/models/dqn_pytorch.py
@@ -64,12 +64,11 @@ class name
def __init__(self, action_wrapper: ActionSpaceBase, state_builder: StateBase,
gamma=0.99, learning_rate=0.001, learning_rate_min=0.0001,
learning_rate_decay=0.99995, learning_rate_decay_ep_cutoff=0,
name='DQNPytorch', epsilon_start=1.0, epsilon_min=0.01,
epsilon_decay_rate=0.995, per_episode_epsilon_decay=False,
name='DQNPytorch',
batch_size=64, memory_maxlen=50000, min_memory_size=1000,
build_model=ModelBuilder.DEFAULT_BUILD_MODEL, seed_value=None,
cpu_only=False, epsilon_linear_decay=False,
lr_linear_decay=False, epsilon_decay_ep_start=0):
cpu_only=False,
lr_linear_decay=False):

# -----------------------------------------------
self.seed_value = seed_value
@@ -90,14 +89,6 @@ def __init__(self, action_wrapper: ActionSpaceBase, state_builder: StateBase,
self.action_size = action_wrapper.size
self.state_size = state_builder.dimension

# EXPLORATION PARAMETERS FOR EPSILON GREEDY STRATEGY
self.epsilon_greedy = epsilon_start
self.epsilon_min = epsilon_min
self.epsilon_decay_rate = epsilon_decay_rate
self.per_episode_epsilon_decay = per_episode_epsilon_decay
self.epsilon_linear_decay = epsilon_linear_decay
self.epsilon_decay_ep_start = epsilon_decay_ep_start

# self.tensorboard_callback_logdir = ''
self.tensorboard_callback = None
# -----------------------------------------------------------------
@@ -174,9 +165,6 @@ def learn(self, s, a, r, s_, done):
# to do: add tau to model definition so that it can be passed here
self.soft_update(self.model, self.target_model)

if not self.per_episode_epsilon_decay:
self.decay_epsilon()

def soft_update(self, local_model, target_model, tau=1e-3):
"""Soft update model parameters.
θ_target = τ*θ_local + (1 - τ)*θ_target
@@ -191,25 +179,6 @@ def soft_update(self, local_model, target_model, tau=1e-3):
target_param.data.copy_(
tau * local_param.data + (1 - tau) * target_param.data)

def choose_action(self, state, excluded_actions, is_training=True):
"""
If the epsilon greedy threshold is met, a random action will
be returned. If not, self.predict will be called to choose the action
with the highest Q-Value.
"""
if not is_training:
return self.predict(state, excluded_actions)

else:
if np.random.rand() <= self.epsilon_greedy:
random_action = random.choice(self.actions)
# Removing excluded actions
while random_action in excluded_actions:
random_action = random.choice(self.actions)
return random_action
else:
return self.predict(state, excluded_actions)

def predict(self, state, excluded_actions):
"""Gets the action with the highest Q-value from our DQN PyTorch model"""
state = torch.from_numpy(state).float().unsqueeze(0).to(device)
@@ -264,11 +233,9 @@ def set_seeds(self):

def ep_reset(self, episode=0):
"""
This method is mainly used to enact the decay_epsilon and decay_lr
This method is mainly used to enact the decay_lr
at the end of every episode.
"""
if self.per_episode_epsilon_decay and episode >= self.epsilon_decay_ep_start:
self.decay_epsilon()

if (episode > self.learning_rate_decay_ep_cutoff
and self.learning_rate_decay != 1):
@@ -286,19 +253,6 @@ def decay_lr(self):
else:
if self.learning_rate > self.learning_rate_min:
self.learning_rate *= self.learning_rate_decay

def decay_epsilon(self):
"""
Decays the current epsilon greedy value, either linearly or by
multiplying it by epsilon_decay_rate (the closer the rate is to 1,
the slower epsilon decreases).
"""
if self.epsilon_linear_decay:
if self.epsilon_greedy > self.epsilon_min:
self.epsilon_greedy -= (1 - self.epsilon_decay_rate)
else:
if self.epsilon_greedy > self.epsilon_min:
self.epsilon_greedy *= self.epsilon_decay_rate


class QNetwork(nn.Module):
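Note (reference only, not part of this commit): the soft_update method kept in this file blends the local Q-network into the target network using θ_target = τ*θ_local + (1 - τ)*θ_target. A minimal standalone sketch of that update rule is shown below; the two small Linear layers are illustrative stand-ins for the local and target Q-networks.

import torch
import torch.nn as nn

def soft_update(local_model: nn.Module, target_model: nn.Module, tau: float = 1e-3) -> None:
    # Blend each target parameter toward its local counterpart:
    # theta_target = tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(
            tau * local_param.data + (1 - tau) * target_param.data)

# Illustrative stand-ins for the local and target Q-networks.
local_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
target_net.load_state_dict(local_net.state_dict())  # start from identical weights
soft_update(local_net, target_net, tau=1e-3)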
11 changes: 0 additions & 11 deletions urnai/models/model_base.py
@@ -20,17 +20,6 @@ def predict(self, state) -> int:
"""Returns the best action for this given state"""
...

@abstractmethod
def choose_action(self, state, excluded_actions, is_testing=False) -> int:
"""
Implements the logic for choosing an action while training and while
testing an Agent. For most Reinforcement Learning Algorithms, this method
will choose an action directly when is_testing=True, and will implement the
exploration algorithm when is_testing=False.
One such commonly used exploration algorithm is the epsilon greedy strategy.
"""
pass

def save(self, persist_path) -> None:
self.persistence.save(persist_path)

64 changes: 59 additions & 5 deletions urnai/sc2/agents/sc2_agent.py
@@ -1,6 +1,9 @@
import os
import random
import sys

import numpy as np

from urnai.actions.action_space_base import ActionSpaceBase
from urnai.agents.agent_base import AgentBase
from urnai.models.model_base import ModelBase
@@ -12,26 +15,39 @@

class SC2Agent(AgentBase):
def __init__(self, action_space : ActionSpaceBase, state_builder : StateBase,
model: ModelBase, reward_builder: RewardBase):
model: ModelBase, reward_builder: RewardBase, epsilon_start=1.0,
epsilon_min=0.01, epsilon_decay_rate=0.995,
per_episode_epsilon_decay=False, epsilon_linear_decay=False,
epsilon_decay_ep_start=0):
super().__init__(action_space, state_builder, model, reward_builder)
# self.reward = 0
self.episodes = 0
self.steps = 0

# EXPLORATION PARAMETERS FOR EPSILON GREEDY STRATEGY
self.epsilon_greedy = epsilon_start
self.epsilon_min = epsilon_min
self.epsilon_decay_rate = epsilon_decay_rate
self.per_episode_epsilon_decay = per_episode_epsilon_decay
self.epsilon_linear_decay = epsilon_linear_decay
self.epsilon_decay_ep_start = epsilon_decay_ep_start


def reset(self, episode=0):
super().reset(episode)
self.episodes += 1
if self.per_episode_epsilon_decay and episode >= self.epsilon_decay_ep_start:
self.decay_epsilon()

def step(self, obs, done, is_training=True):
self.steps += 1

if self.action_space.is_action_done():
current_state = self.state_space.update(obs)
excluded_actions = self.action_space.get_excluded_actions(obs)
predicted_action_idx = self.model.choose_action(current_state,
excluded_actions,
is_training)
self.previous_action = predicted_action_idx
chosen_action_idx = self.choose_action(current_state, excluded_actions,
is_training)
self.previous_action = chosen_action_idx
self.previous_state = current_state
selected_action = [self.action_space.get_action(self.previous_action, obs)]

@@ -42,3 +58,41 @@ def step(self, obs, done, is_training=True):
# raise error.ActionError(
# 'Invalid function structure. Function name: %s.' % selected_action[0])
return selected_action

def choose_action(self, state, excluded_actions, is_training=True):
if is_training:
if np.random.rand() <= self.epsilon_greedy:
random_action = random.choice(self.action_space.get_actions())
# Removing excluded actions
while random_action in excluded_actions:
random_action = random.choice(self.action_space.get_actions())
return random_action
else:
return self.model.predict(state, excluded_actions)
else:
return self.model.predict(state, excluded_actions)

def learn(self, obs, reward, done) -> None:
"""
If it is not the very first step in an episode, this method will
call the model's learn method.
"""
if self.previous_state is not None:
next_state = self.state_space.update(obs)
self.model.learn(self.previous_state, self.previous_action,
reward, next_state, done)
if not self.per_episode_epsilon_decay:
self.decay_epsilon()

def decay_epsilon(self):
"""
Decays the current epsilon greedy value, either linearly or by
multiplying it by epsilon_decay_rate (the closer the rate is to 1,
the slower epsilon decreases).
"""
if self.epsilon_linear_decay:
if self.epsilon_greedy > self.epsilon_min:
self.epsilon_greedy -= (1 - self.epsilon_decay_rate)
else:
if self.epsilon_greedy > self.epsilon_min:
self.epsilon_greedy *= self.epsilon_decay_rate
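
Note (reference only, not part of the diff above): here is a minimal, self-contained sketch of the epsilon-greedy selection and decay behaviour that this commit moves from the model into SC2Agent. The toy action list, the excluded-action set and the predict_stub function are illustrative assumptions; in the real agent the exploitation branch calls self.model.predict.

import random
import numpy as np

ACTIONS = [0, 1, 2, 3]   # toy action indices
EXCLUDED = {2}           # toy excluded actions

epsilon, epsilon_min, epsilon_decay_rate = 1.0, 0.01, 0.995

def predict_stub(state, excluded_actions):
    # Stand-in for model.predict(): return the first allowed action index.
    return next(a for a in ACTIONS if a not in excluded_actions)

def choose_action(state, excluded_actions, is_training=True):
    # Explore with probability epsilon while training, otherwise exploit.
    if is_training and np.random.rand() <= epsilon:
        action = random.choice(ACTIONS)
        while action in excluded_actions:   # re-draw excluded actions
            action = random.choice(ACTIONS)
        return action
    return predict_stub(state, excluded_actions)

def decay_epsilon():
    # Multiplicative decay (the non-linear branch of SC2Agent.decay_epsilon).
    global epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay_rate

for _ in range(3):
    print(choose_action(state=None, excluded_actions=EXCLUDED))
    decay_epsilon()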
43 changes: 26 additions & 17 deletions urnai/trainers/trainer.py
@@ -1,6 +1,7 @@
import inspect
import os
import sys
from datetime import datetime

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
@@ -11,16 +12,28 @@
class Trainer:
# TODO: Add an option to play every x episodes, instead of just training non-stop

def __init__(self, env, agent, max_training_episodes, max_playing_episodes,
max_steps_training, max_steps_playing,
):
def __init__(
self, env, agent, max_training_episodes, max_playing_episodes,
max_steps_training, max_steps_playing, enable_save=True, save_every=10,
save_path= None, file_name=None,
):

self.env = env
self.agent = agent
self.max_training_episodes = max_training_episodes
self.max_playing_episodes = max_playing_episodes
self.max_steps_training = max_steps_training
self.max_steps_playing = max_steps_playing
self.enable_save = enable_save
self.save_every = save_every
if save_path is None:
save_path = os.path.expanduser('~') + os.path.sep + 'urnai_saved_trainings'
if file_name is None:
file_name=str(datetime.now()).replace(' ','_').replace(':','_').replace('.',
'_')
# self.full_save_path = save_path + os.path.sep + file_name
# TODO: Change this to a user defined path
self.full_save_path = "saves/"

def train(self, reward_from_agent=True):
self.training_loop(is_training=True, reward_from_agent=reward_from_agent)
@@ -92,24 +105,20 @@ def training_loop(self, is_training, reward_from_agent=True):

if done:
print("Episode: %d, Reward: %d" % (current_episodes, ep_reward))
self.agent.model.save("saves/")
# self.agent.model.save("saves/")
break

# if this is not a test (evaluation), saving is enabled and we are in a
# multiple of our save_every variable then we save the model and generate
# graphs
# TODO
# if is_training \
# and self.enable_save \
# and current_episodes > 0 \
# and current_episodes % self.save_every == 0:
# self.save(self.full_save_path)
# if the agent is training, saving is enabled and we are in a
# multiple of our save_every variable then we save the model
if is_training \
and self.enable_save \
and current_episodes > 0 \
and current_episodes % self.save_every == 0:
self.agent.model.save(self.full_save_path)


self.env.close()

# Saving the model at the end of the training loop
# TODO
# if self.enable_save:
# if is_training:
# self.save(self.full_save_path)
if self.enable_save and is_training:
self.agent.model.save(self.full_save_path)
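
Note (reference only, not part of this commit): the updated trainer.py saves every save_every training episodes and once more after the loop. The standalone sketch below illustrates that cadence; the episode count and the save_stub function are assumptions, not code from this commit.

def save_stub(path):
    print(f"saving model to {path}")

def training_loop(max_training_episodes=25, enable_save=True, save_every=10,
                  full_save_path="saves/", is_training=True):
    for current_episodes in range(1, max_training_episodes + 1):
        # ... one episode of environment interaction would run here ...
        if (is_training and enable_save and current_episodes > 0
                and current_episodes % save_every == 0):
            save_stub(full_save_path)   # periodic checkpoint
    if enable_save and is_training:
        save_stub(full_save_path)       # final save at the end of training

training_loop()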
