Merge branch 'dev' of github.com:MushroomRL/mushroom-rl into dev
carloderamo committed Jun 9, 2021
2 parents a274d60 + 29db0c7 commit 99942b4
Showing 32 changed files with 533 additions and 1,430 deletions.
12 changes: 6 additions & 6 deletions examples/lqr_pg.py
@@ -29,7 +29,7 @@ def experiment(alg, n_epochs, n_iterations, ep_per_run):
logger.info('Experiment Algorithm: ' + alg.__name__)

# MDP
- mdp = LQR.generate(dimensions=1)
+ mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

approximator = Regressor(LinearApproximator,
input_shape=mdp.info.observation_space.shape,
@@ -39,32 +39,32 @@ def experiment(alg, n_epochs, n_iterations, ep_per_run):
input_shape=mdp.info.observation_space.shape,
output_shape=mdp.info.action_space.shape)

- sigma_weights = 2 * np.ones(sigma.weights_size)
+ sigma_weights = 0.25 * np.ones(sigma.weights_size)
sigma.set_weights(sigma_weights)

policy = StateStdGaussianPolicy(approximator, sigma)

# Agent
- optimizer = AdaptiveOptimizer(eps=.01)
+ optimizer = AdaptiveOptimizer(eps=1e-2)
algorithm_params = dict(optimizer=optimizer)
agent = alg(mdp.info, policy, **algorithm_params)

# Train
core = Core(agent, mdp)
dataset_eval = core.evaluate(n_episodes=ep_per_run)
J = compute_J(dataset_eval, gamma=mdp.info.gamma)
- logger.epoch_info(0, J=np.mean(J), policy_weights=policy.get_weights())
+ logger.epoch_info(0, J=np.mean(J), policy_weights=policy.get_weights().tolist())

for i in trange(n_epochs, leave=False):
core.learn(n_episodes=n_iterations * ep_per_run,
n_episodes_per_fit=ep_per_run)
dataset_eval = core.evaluate(n_episodes=ep_per_run)
J = compute_J(dataset_eval, gamma=mdp.info.gamma)
- logger.epoch_info(i+1, J=np.mean(J), policy_weights=policy.get_weights())
+ logger.epoch_info(i+1, J=np.mean(J), policy_weights=policy.get_weights().tolist())


if __name__ == '__main__':
algs = [REINFORCE, GPOMDP, eNAC]

for alg in algs:
- experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=100)
+ experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=25)
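
Note on the lqr_pg.py changes above: the example now runs a 2-dimensional LQR with bounded state and action, a smaller initial standard deviation, and fewer episodes per run. The switch from policy.get_weights() to policy.get_weights().tolist() only affects logging; a plausible reason (an illustration, not stated in the commit) is that a plain Python list prints with commas on one line, which reads better in the epoch log than NumPy's array formatting:

import numpy as np

w = np.array([-0.62, 0.13])
print('policy_weights: {}'.format(w))           # numpy formatting: [-0.62  0.13]
print('policy_weights: {}'.format(w.tolist()))  # list formatting:  [-0.62, 0.13]
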
@@ -42,7 +42,7 @@ def _compute_gradient(self, J):

self.sum_grad_log_list = list()

- return nat_grad,
+ return nat_grad

def _step_update(self, x, u, r):
self.sum_grad_log += self.policy.diff_log(x, u)
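
The change above drops the trailing comma, so _compute_gradient now returns the natural gradient array itself instead of a one-element tuple. A small reminder of what the trailing comma does in Python (illustrative only, not code from the repository):

import numpy as np

def returns_tuple():
    nat_grad = np.ones(3)
    return nat_grad,      # trailing comma makes this a 1-tuple

res = returns_tuple()
print(type(res), len(res))   # <class 'tuple'> 1
print(type(res[0]))          # <class 'numpy.ndarray'> -> the caller had to unpack res[0]

This pairs with the simplification of _update_parameters further down in this commit, which now expects a single gradient from every policy-gradient algorithm.
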
31 changes: 16 additions & 15 deletions mushroom_rl/algorithms/policy_search/policy_gradient/gpomdp.py
@@ -40,32 +40,34 @@ def __init__(self, mdp_info, policy, optimizer, features=None):
np.seterr(divide='ignore', invalid='ignore')

def _compute_gradient(self, J):
- gradient = np.zeros(self.policy.weights_size)

n_episodes = len(self.list_sum_d_log_pi_ep)

+ grad_J_episode = list()
for i in range(n_episodes):
list_sum_d_log_pi = self.list_sum_d_log_pi_ep[i]
list_reward = self.list_reward_ep[i]

n_steps = len(list_sum_d_log_pi)

+ gradient = np.zeros(self.policy.weights_size)

for t in range(n_steps):
step_grad = list_sum_d_log_pi[t]
step_reward = list_reward[t]
- baseline = self.baseline_num[t] / self.baseline_den[t]
+ baseline = np.mean(self.baseline_num[t], axis=0) / np.mean(self.baseline_den[t], axis=0)
baseline[np.logical_not(np.isfinite(baseline))] = 0.
- gradient += (step_reward - baseline) * step_grad
+ gradient += step_grad * (step_reward - baseline)

- gradient /= n_episodes
+ grad_J_episode.append(gradient)

+ gradJ = np.mean(grad_J_episode, axis=0)

self.list_reward_ep = list()
self.list_sum_d_log_pi_ep = list()

self.baseline_num = list()
self.baseline_den = list()

- return gradient,
+ return gradJ

def _step_update(self, x, u, r):
discounted_reward = self.df * r
@@ -74,17 +76,16 @@ def _step_update(self, x, u, r):
d_log_pi = self.policy.diff_log(x, u)
self.sum_d_log_pi += d_log_pi

- self.list_sum_d_log_pi.append(self.sum_d_log_pi)
+ self.list_sum_d_log_pi.append(self.sum_d_log_pi.copy())

squared_sum_d_log_pi = np.square(self.sum_d_log_pi)

- if self.step_count < len(self.baseline_num):
-     self.baseline_num[
-         self.step_count] += discounted_reward * squared_sum_d_log_pi
-     self.baseline_den[self.step_count] += squared_sum_d_log_pi
- else:
-     self.baseline_num.append(discounted_reward * squared_sum_d_log_pi)
-     self.baseline_den.append(squared_sum_d_log_pi)
+ if self.step_count >= len(self.baseline_num):
+     self.baseline_num.append(list())
+     self.baseline_den.append(list())
+
+ self.baseline_num[self.step_count].append(discounted_reward * squared_sum_d_log_pi)
+ self.baseline_den[self.step_count].append(squared_sum_d_log_pi)

self.step_count += 1

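For context on the gpomdp.py changes (background, not part of the commit message): baseline_num[t] and baseline_den[t] are now lists over episodes for each time step, and the gradient is averaged per episode, which matches the usual GPOMDP estimator with a step-dependent optimal baseline, sketched here from the standard formulation (element-wise in the parameters):

\hat{\nabla}_\theta J = \frac{1}{N} \sum_{n=1}^{N} \sum_{t=0}^{T-1} \left( \sum_{k=0}^{t} \nabla_\theta \log \pi_\theta(a_k^n \mid s_k^n) \right) \left( \gamma^t r_t^n - b_t \right),
\qquad
b_t = \frac{\frac{1}{N} \sum_n \left( \sum_{k=0}^{t} \nabla_\theta \log \pi_\theta(a_k^n \mid s_k^n) \right)^2 \gamma^t r_t^n}{\frac{1}{N} \sum_n \left( \sum_{k=0}^{t} \nabla_\theta \log \pi_\theta(a_k^n \mid s_k^n) \right)^2}

In the new code, np.mean(self.baseline_num[t], axis=0) and np.mean(self.baseline_den[t], axis=0) compute exactly these per-step averages over episodes (the previous version accumulated them in place with +=), and the per-episode gradients are averaged through grad_J_episode instead of dividing a single accumulator by n_episodes.
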
@@ -60,16 +60,11 @@ def _update_parameters(self, J):
episode in the dataset.
"""
- res = self._compute_gradient(J)
+ grad = self._compute_gradient(J)

theta_old = self.policy.get_weights()

- if len(res) == 1:
-     grad = res[0]
-     theta_new = self.optimizer(theta_old, grad)
- else:
-     grad, nat_grad = res
-     theta_new = self.optimizer(theta_old, grad, nat_grad)
+ theta_new = self.optimizer(theta_old, grad)

self.policy.set_weights(theta_new)

@@ -111,6 +106,9 @@ def _compute_gradient(self, J):
J (list): list of the cumulative discounted rewards for each
episode in the dataset.
+ Returns:
+     The gradient computed by the algorithm.
"""
raise NotImplementedError('PolicyGradient is an abstract class')

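With the unified return convention above, every policy-gradient algorithm hands _update_parameters a single gradient array, and the optimizer is always called as optimizer(theta_old, grad). A minimal sketch of an optimizer compatible with that two-argument call, assuming only this interface (this is not the MushroomRL AdaptiveOptimizer implementation):

import numpy as np

class FixedLearningRateOptimizer:
    # Plain gradient-ascent step: theta <- theta + lr * grad.
    def __init__(self, lr=1e-2):
        self.lr = lr

    def __call__(self, theta, grad):
        return theta + self.lr * np.asarray(grad)

# usage sketch
optimizer = FixedLearningRateOptimizer(lr=1e-2)
theta = np.zeros(4)
grad = np.array([0.1, -0.2, 0.0, 0.3])
theta = optimizer(theta, grad)
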
@@ -28,8 +28,7 @@ def __init__(self, mdp_info, policy, optimizer, features=None):
np.seterr(divide='ignore', invalid='ignore')

def _compute_gradient(self, J):
- baseline = np.mean(self.baseline_num, axis=0) / np.mean(
-     self.baseline_den, axis=0)
+ baseline = np.mean(self.baseline_num, axis=0) / np.mean(self.baseline_den, axis=0)
baseline[np.logical_not(np.isfinite(baseline))] = 0.
grad_J_episode = list()
for i, J_episode in enumerate(J):
@@ -41,7 +40,7 @@ def _compute_gradient(self, J):
self.baseline_den = list()
self.baseline_num = list()

- return grad_J,
+ return grad_J

def _step_update(self, x, u, r):
d_log_pi = self.policy.diff_log(x, u)
1 change: 0 additions & 1 deletion mushroom_rl/environments/__init__.py
@@ -30,7 +30,6 @@
try:
PyBullet = None
from .pybullet import PyBullet
- from .pybullet_envs import *
except ImportError:
pass

