
Commit 99942b4

Merge branch 'dev' of github.com:MushroomRL/mushroom-rl into dev
2 parents: a274d60 + 29db0c7

32 files changed: +533 additions, -1430 deletions

examples/lqr_pg.py
Lines changed: 6 additions & 6 deletions

@@ -29,7 +29,7 @@ def experiment(alg, n_epochs, n_iterations, ep_per_run):
     logger.info('Experiment Algorithm: ' + alg.__name__)

     # MDP
-    mdp = LQR.generate(dimensions=1)
+    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

     approximator = Regressor(LinearApproximator,
                              input_shape=mdp.info.observation_space.shape,
@@ -39,32 +39,32 @@ def experiment(alg, n_epochs, n_iterations, ep_per_run):
                       input_shape=mdp.info.observation_space.shape,
                       output_shape=mdp.info.action_space.shape)

-    sigma_weights = 2 * np.ones(sigma.weights_size)
+    sigma_weights = 0.25 * np.ones(sigma.weights_size)
     sigma.set_weights(sigma_weights)

     policy = StateStdGaussianPolicy(approximator, sigma)

     # Agent
-    optimizer = AdaptiveOptimizer(eps=.01)
+    optimizer = AdaptiveOptimizer(eps=1e-2)
     algorithm_params = dict(optimizer=optimizer)
     agent = alg(mdp.info, policy, **algorithm_params)

     # Train
     core = Core(agent, mdp)
     dataset_eval = core.evaluate(n_episodes=ep_per_run)
     J = compute_J(dataset_eval, gamma=mdp.info.gamma)
-    logger.epoch_info(0, J=np.mean(J), policy_weights=policy.get_weights())
+    logger.epoch_info(0, J=np.mean(J), policy_weights=policy.get_weights().tolist())

     for i in trange(n_epochs, leave=False):
         core.learn(n_episodes=n_iterations * ep_per_run,
                    n_episodes_per_fit=ep_per_run)
         dataset_eval = core.evaluate(n_episodes=ep_per_run)
         J = compute_J(dataset_eval, gamma=mdp.info.gamma)
-        logger.epoch_info(i+1, J=np.mean(J), policy_weights=policy.get_weights())
+        logger.epoch_info(i+1, J=np.mean(J), policy_weights=policy.get_weights().tolist())


 if __name__ == '__main__':
     algs = [REINFORCE, GPOMDP, eNAC]

     for alg in algs:
-        experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=100)
+        experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=25)
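Note on the new settings: the LQR task is now 2-dimensional with bounded state and action, the initial exploration noise is smaller, and the policy weights are converted with .tolist() before logging. Although ep_per_run drops from 100 to 25, the learning budget per epoch is unchanged, since core.learn still collects n_iterations * ep_per_run episodes. A minimal arithmetic sketch (plain Python, not part of the commit):

# Sketch (not from the commit): per-epoch sample budget with the new settings.
n_iterations, ep_per_run = 4, 25
episodes_per_epoch = n_iterations * ep_per_run     # 100 learning episodes per epoch
fits_per_epoch = episodes_per_epoch // ep_per_run  # one policy fit every 25 episodes -> 4 fits
print(episodes_per_epoch, fits_per_epoch)          # 100 4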

mushroom_rl/algorithms/policy_search/policy_gradient/enac.py
Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def _compute_gradient(self, J):

         self.sum_grad_log_list = list()

-        return nat_grad,
+        return nat_grad

     def _step_update(self, x, u, r):
         self.sum_grad_log += self.policy.diff_log(x, u)

mushroom_rl/algorithms/policy_search/policy_gradient/gpomdp.py
Lines changed: 16 additions & 15 deletions

@@ -40,32 +40,34 @@ def __init__(self, mdp_info, policy, optimizer, features=None):
         np.seterr(divide='ignore', invalid='ignore')

     def _compute_gradient(self, J):
-        gradient = np.zeros(self.policy.weights_size)
-
         n_episodes = len(self.list_sum_d_log_pi_ep)
-
+        grad_J_episode = list()
         for i in range(n_episodes):
             list_sum_d_log_pi = self.list_sum_d_log_pi_ep[i]
             list_reward = self.list_reward_ep[i]

             n_steps = len(list_sum_d_log_pi)

+            gradient = np.zeros(self.policy.weights_size)
+
             for t in range(n_steps):
                 step_grad = list_sum_d_log_pi[t]
                 step_reward = list_reward[t]
-                baseline = self.baseline_num[t] / self.baseline_den[t]
+                baseline = np.mean(self.baseline_num[t], axis=0) / np.mean(self.baseline_den[t], axis=0)
                 baseline[np.logical_not(np.isfinite(baseline))] = 0.
-                gradient += (step_reward - baseline) * step_grad
+                gradient += step_grad * (step_reward - baseline)

-        gradient /= n_episodes
+            grad_J_episode.append(gradient)
+
+        gradJ = np.mean(grad_J_episode, axis=0)

         self.list_reward_ep = list()
         self.list_sum_d_log_pi_ep = list()

         self.baseline_num = list()
         self.baseline_den = list()

-        return gradient,
+        return gradJ

     def _step_update(self, x, u, r):
         discounted_reward = self.df * r
@@ -74,17 +76,16 @@ def _step_update(self, x, u, r):
         d_log_pi = self.policy.diff_log(x, u)
         self.sum_d_log_pi += d_log_pi

-        self.list_sum_d_log_pi.append(self.sum_d_log_pi)
+        self.list_sum_d_log_pi.append(self.sum_d_log_pi.copy())

         squared_sum_d_log_pi = np.square(self.sum_d_log_pi)

-        if self.step_count < len(self.baseline_num):
-            self.baseline_num[
-                self.step_count] += discounted_reward * squared_sum_d_log_pi
-            self.baseline_den[self.step_count] += squared_sum_d_log_pi
-        else:
-            self.baseline_num.append(discounted_reward * squared_sum_d_log_pi)
-            self.baseline_den.append(squared_sum_d_log_pi)
+        if self.step_count >= len(self.baseline_num):
+            self.baseline_num.append(list())
+            self.baseline_den.append(list())
+
+        self.baseline_num[self.step_count].append(discounted_reward * squared_sum_d_log_pi)
+        self.baseline_den[self.step_count].append(squared_sum_d_log_pi)

         self.step_count += 1
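This change keeps baseline_num[t] and baseline_den[t] as per-step lists with one entry per episode, so the step-t baseline is the ratio of their per-episode means, with non-finite entries (0/0 for steps some episodes never reach) zeroed; per-episode gradients are then averaged instead of summed and divided. A minimal numpy sketch of that baseline computation, mirroring the lines above (step_baseline is an illustrative helper, not part of the library):

import numpy as np

np.seterr(divide='ignore', invalid='ignore')

def step_baseline(baseline_num_t, baseline_den_t):
    # Average the per-episode terms first, then take the elementwise ratio.
    baseline = np.mean(baseline_num_t, axis=0) / np.mean(baseline_den_t, axis=0)
    baseline[np.logical_not(np.isfinite(baseline))] = 0.  # guard against 0/0
    return baseline

# Example with two episodes and a 3-dimensional weight vector.
num_t = [np.array([1., 2., 0.]), np.array([3., 0., 0.])]
den_t = [np.array([2., 1., 0.]), np.array([2., 1., 0.])]
print(step_baseline(num_t, den_t))  # -> [1. 1. 0.]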

mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py
Lines changed: 5 additions & 7 deletions

@@ -60,16 +60,11 @@ def _update_parameters(self, J):
                 episode in the dataset.

         """
-        res = self._compute_gradient(J)
+        grad = self._compute_gradient(J)

         theta_old = self.policy.get_weights()

-        if len(res) == 1:
-            grad = res[0]
-            theta_new = self.optimizer(theta_old, grad)
-        else:
-            grad, nat_grad = res
-            theta_new = self.optimizer(theta_old, grad, nat_grad)
+        theta_new = self.optimizer(theta_old, grad)

         self.policy.set_weights(theta_new)

@@ -111,6 +106,9 @@ def _compute_gradient(self, J):
             J (list): list of the cumulative discounted rewards for each
                 episode in the dataset.

+        Returns:
+            The gradient computed by the algorithm.
+
         """
         raise NotImplementedError('PolicyGradient is an abstract class')
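Since eNAC, GPOMDP and REINFORCE now all return a single gradient array, _update_parameters no longer unpacks a tuple and the optimizer is always called with exactly two arguments. A hedged sketch of the interface an optimizer therefore has to provide (FixedStepOptimizer is hypothetical and only illustrative; the examples use MushroomRL's AdaptiveOptimizer):

import numpy as np

class FixedStepOptimizer:
    def __init__(self, lr=1e-2):
        self.lr = lr

    def __call__(self, theta_old, grad):
        # Gradient ascent step: policy-gradient methods maximise the expected return J.
        return theta_old + self.lr * grad

opt = FixedStepOptimizer()
theta = opt(np.zeros(3), np.ones(3))  # -> array([0.01, 0.01, 0.01])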

mushroom_rl/algorithms/policy_search/policy_gradient/reinforce.py
Lines changed: 2 additions & 3 deletions

@@ -28,8 +28,7 @@ def __init__(self, mdp_info, policy, optimizer, features=None):
         np.seterr(divide='ignore', invalid='ignore')

     def _compute_gradient(self, J):
-        baseline = np.mean(self.baseline_num, axis=0) / np.mean(
-            self.baseline_den, axis=0)
+        baseline = np.mean(self.baseline_num, axis=0) / np.mean(self.baseline_den, axis=0)
         baseline[np.logical_not(np.isfinite(baseline))] = 0.
         grad_J_episode = list()
         for i, J_episode in enumerate(J):
@@ -41,7 +40,7 @@ def _compute_gradient(self, J):
         self.baseline_den = list()
         self.baseline_num = list()

-        return grad_J,
+        return grad_J

     def _step_update(self, x, u, r):
         d_log_pi = self.policy.diff_log(x, u)

mushroom_rl/environments/__init__.py
Lines changed: 0 additions & 1 deletion

@@ -30,7 +30,6 @@
 try:
     PyBullet = None
     from .pybullet import PyBullet
-    from .pybullet_envs import *
 except ImportError:
     pass
