import numpy as np
import tensorflow as tf
import gym
from utils import *
from model import *
import argparse
from rollouts import *
import json
import multiprocessing
import time
import gym_buttons
parser = argparse.ArgumentParser(description='TRPO.')
# these parameters should stay the same
parser.add_argument("--task", type=str, default='Reacher-v1')
parser.add_argument("--timesteps_per_batch", type=int, default=10000)
parser.add_argument("--n_steps", type=int, default=50000000)
parser.add_argument("--gamma", type=float, default=.99)
parser.add_argument("--max_kl", type=float, default=.01)
parser.add_argument("--cg_damping", type=float, default=1e-3)
parser.add_argument("--num_threads", type=int, default=5)
parser.add_argument("--monitor", type=bool, default=False)
parser.add_argument("--policymode", type=str, default="single")
# change these parameters for testing
parser.add_argument("--decay_method", type=str, default="none") # adaptive, none
parser.add_argument("--timestep_adapt", type=int, default=0)
parser.add_argument("--kl_adapt", type=float, default=0)
args = parser.parse_args()
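# Cap each episode at the environment's registered timestep limit.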
args.max_pathlength = gym.spec(args.task).timestep_limit
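# The TRPO learner runs in its own process; these queues carry tasks to it and
# results back (the reason is explained in the comment inside the training loop below).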
learner_tasks = multiprocessing.JoinableQueue()
learner_results = multiprocessing.Queue()
learner_env = gym.make(args.task)
learner = TRPO(args, learner_env.observation_space, learner_env.action_space, learner_tasks, learner_results)
learner.start()
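# ParallelRollout manages the asynchronous worker processes that collect rollouts (see rollouts.py).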
rollouts = ParallelRollout(args)
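# Task code 1 appears to ask the learner for its initial policy weights,
# which are then pushed to the rollout workers before training starts.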
learner_tasks.put(1)
learner_tasks.join()
starting_weights = learner_results.get()
rollouts.set_policy_weights(starting_weights)
start_time = time.time()
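# Per-iteration statistics; periodically dumped to JSON in the training loop.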
history = {}
history["rollout_time"] = []
history["learn_time"] = []
history["mean_reward"] = []
history["timesteps"] = []
# start it off with a big negative number
last_reward = -1000000
recent_total_reward = 0
totalsteps = 0
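# Remember the initial batch size and KL bound, presumably for the adaptive decay options
# (--decay_method / --timestep_adapt / --kl_adapt); they are not modified in this file.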
starting_timesteps = args.timesteps_per_batch
starting_kl = args.max_kl
# saved_policy = np.load("policy.npy")
# rollouts.set_policy_weights(saved_policy)
iteration = 0
while True:
    iteration += 1
    if iteration % 100 == 0:
        rollouts.discretize()
    # runs a bunch of async processes that collect rollouts
    rollout_start = time.time()
    paths = rollouts.rollout()
    rollout_time = (time.time() - rollout_start) / 60.0
    # Why is the learner in an async process?
    # Well, it turns out tensorflow has an issue: when there's a tf.Session in the main thread
    # and an async process creates another tf.Session, it will freeze up.
    # To solve this, we just make the learner's tf.Session in its own async process,
    # and wait until the learner's done before continuing the main thread.
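    # Task code 2, followed by the batch of paths, asks the learner to run one TRPO update
    # under the current KL bound; it replies with (new_policy_weights, mean_reward).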
    learn_start = time.time()
    learner_tasks.put((2, args.max_kl))
    learner_tasks.put(paths)
    learner_tasks.join()
    new_policy_weights, mean_reward = learner_results.get()
    learn_time = (time.time() - learn_start) / 60.0
print "-------- Iteration %d ----------" % iteration
print "Total time: %.2f mins" % ((time.time() - start_time) / 60.0)
history["rollout_time"].append(rollout_time)
history["learn_time"].append(learn_time)
history["mean_reward"].append(mean_reward)
history["timesteps"].append(args.timesteps_per_batch)
recent_total_reward += mean_reward
print "Current steps is " + str(args.timesteps_per_batch) + " and KL is " + str(args.max_kl)
    if iteration % 100 == 0:
        with open("%s-%s" % (args.task, args.policymode), "w") as outfile:
            json.dump(history, outfile)
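    # Stop once the total number of environment steps exceeds --n_steps; otherwise push the
    # updated policy weights to the rollout workers and continue.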
    totalsteps += args.timesteps_per_batch
    print("%d total steps have happened" % totalsteps)

    if totalsteps > args.n_steps:
        break

    rollouts.set_policy_weights(new_policy_weights)

rollouts.end()