update batcher

vladfi1 · ysharma1126 · Apr 10, 2017 · Apr 10, 2017 · Apr 10, 2017 · Apr 10, 2017
commit 60c6abedf3a2d4b64f3e2633741cb7ad86ecb12f
diff --git a/batcher.py b/batcher.py
@@ -4,7 +4,7 @@
 
 SAVE_DIR = "saves/"
 LOG_DIR = "slurm_logs/"
-TRAIN_TIME = 3600 * 4  # 4 hours
+TRAIN_TIME = 3600 * 10  # 10 hours
 
 def get_jobs():
   not_ran = set()
@@ -21,25 +21,28 @@ def get_jobs():
   return not_ran
 
 def get_jobid(job):
-  logs = os.listdir(LOG_DIR)
-  for log in logs:
-    if job in log:
-      l = log.rfind("_")
-      r = log.rfind(".")
-      return int(log[l+1:r])
-  return 4294967294  # Seems like this is the default job id
+  cmd = "squeue -u vladfi1 -o '%i %j %t' | grep {0}".format(job)
+  output = subprocess.check_output(cmd, shell=True).strip().split()
+  return int(output[0])
+
+def get_status(job_id):
+  cmd = "squeue --job {0} -o '%t'".format(job_id)
+  output = subprocess.check_output(cmd, shell=True).splitlines()
+  if len(output) != 2:
+    raise ValueError("Bad output:" + str(output))
+
+  status = output[1]
+  if type(status)==bytes:
+    status = status.decode("utf-8")
+  return status
 
 def get_trainnode(job_id):
-  cmd = "squeue --job {0}".format(job_id)
+  cmd = "squeue --job {0} -o '%N'".format(job_id)
   output = subprocess.check_output(cmd, shell=True).splitlines()
   if len(output) != 2:
-    return None, None
-  output = output[1].split()
-  if len(output) != 8:
-    return None, None
-  status = output[4]
-  node = output[7][4:]
-  return status, int(node)
+    raise ValueError("Bad output:" + str(output))
+  node = output[1][4:]
+  return int(node)
 
 def main():
   queue = set()
@@ -54,7 +57,7 @@ def main():
 
     # start training
     train_cmd  = "python launcher.py {0}/{1} --init".format(SAVE_DIR,job)
-    print("Running train command:" train_cmd)
+    print("Running train command:", train_cmd)
     os.system(train_cmd)
 
     # make sure the job started
@@ -66,10 +69,15 @@ def main():
     status = "PD"
     while status == "PD":
       time.sleep(5)
-      status, train_machine = get_trainnode(job_id)
+      status = get_status(job_id)
+
+    if status != "R":
+      raise ValueError("Bad Status: " + str(status))
+
+    train_machine = get_trainnode(job_id)
 
     print("Done waiting status =",status,"train machine =",str(train_machine))
-    if status == None:
+    if status is None:
       continue
 
     agent_cmd = "python launcher.py {0}/{1} --trainer {2}".format(SAVE_DIR, job, train_machine)

diff --git a/phillip/dqn.py b/phillip/dqn.py
@@ -8,7 +8,7 @@ class DQN(Default):
   hidden_size = []
 
   _options = [
-    Option('q_layers', type=int, nargs='+', default=[128, 128], help="sizes of the dqn hidden layers"),
+    Option('q_fc_layers', type=int, nargs='+', default=[128, 128], help="sizes of the dqn hidden layers"),
     Option('epsilon', type=float, default=0.02, help="pick random action with probability EPSILON"),
     Option('temperature', type=float, default=0.01, help="Boltzmann distribution over actions"),
     Option('sarsa', type=bool, default=True, help="use action taken instead of max when computing target Q-values"),
@@ -32,7 +32,7 @@ def __init__(self, embedGame, embedAction, global_step, rlConfig, scope='q', **k
     with tf.variable_scope(scope):
       self.net = tfl.Sequential()
       prev_size = history_size
-      for i, size in enumerate(self.q_layers):
+      for i, size in enumerate(self.q_fc_layers):
         with tf.variable_scope("layer_%d" % i):
           self.net.append(tfl.FCLayer(prev_size, size, self.nl))
         prev_size = size