Commit c52ae8a

include SSD and thres SSD in eval_policy

Author: rfal
1 parent dafb657

File tree

10 files changed: +99 -102 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -54,4 +54,4 @@ $ python run_stable_baselines3.py -C [experiment config file (required)] -P [num
 Example configuration files are provided in the **config** directory, and see [parameters.md](parameters.md) for detailed explanations of common parameters.
 
 ## Third Party Libraries
-This project uses implementations of A2C, PPO, DQN and QRDQN agents from [stable-baselines3](https://github.com/DLR-RM/stable-baselines3) and [stable-baselines3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib), and makes some modifications to apply to the proposed environment. There are some agent specific parameters in the provided configuration files, please refer to [on_policy_algorithm.py](https://github.com/RobustFieldAutonomyLab/Stochastic_Road_Network/blob/main/thirdparty/stable_baselines3/common/on_policy_algorithm.py) ((A2C and PPO)) and [off_policy_algorithm.py](https://github.com/RobustFieldAutonomyLab/Stochastic_Road_Network/blob/main/thirdparty/stable_baselines3/common/off_policy_algorithm.py) (DQN and QRDQN) for further information.
+This project uses implementations of A2C, PPO, DQN and QR-DQN agents from [stable-baselines3](https://github.com/DLR-RM/stable-baselines3) and [stable-baselines3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib), and makes some modifications to apply to the proposed environment. There are some agent-specific parameters in the provided configuration files; please refer to [on_policy_algorithm.py](https://github.com/RobustFieldAutonomyLab/Stochastic_Road_Network/blob/main/thirdparty/stable_baselines3/common/on_policy_algorithm.py) (A2C and PPO) and [off_policy_algorithm.py](https://github.com/RobustFieldAutonomyLab/Stochastic_Road_Network/blob/main/thirdparty/stable_baselines3/common/off_policy_algorithm.py) (DQN and QR-DQN) for further information.
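The evaluation options added by this commit are read from the "agent" section of the experiment config (see params_dashboard and run_trial below). A minimal sketch of what that section might look like for a QR-DQN run; the key names (including eval_policy and ssd_thres) come from the scripts in this commit, but the block layout and all values here are illustrative assumptions, not copies of a shipped config file:

    # Hypothetical "agent" block of an experiment config, shown as a Python dict.
    # Only "eval_policy" and "ssd_thres" are new in this commit; the values are assumed.
    agent_config = {
        "name": "QRDQN",
        "discount": 0.99,                      # assumed value
        "alpha": 1e-4,                         # assumed learning rate
        "buffer_size": 50000,                  # assumed value
        "n_quantiles": 50,                     # assumed value
        "eval_policy": "Thresholded_SSD",      # "Greedy", "SSD", or "Thresholded_SSD"
        "ssd_thres": 1e-3,                     # only read when eval_policy == "Thresholded_SSD"
    }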

config/config_A2C_Town01_cnn.json

Lines changed: 0 additions & 22 deletions
This file was deleted.

config/config_DQN_Town01_cnn.json

Lines changed: 0 additions & 25 deletions
This file was deleted.

config/config_PPO_Town01_cnn.json

Lines changed: 0 additions & 24 deletions
This file was deleted.

run_stable_baselines3.py

Lines changed: 17 additions & 26 deletions
@@ -70,28 +70,21 @@ def params_dashboard(params):
     print("seed: ",params["base"]["seed"])
     print("num_timesteps: ",params["base"]["num_timesteps"])
     print("agent: ",params["agent"]["name"])
-    print("policy: ",params["policy"])
+    print("network: ",params["policy"])
     print("discount: ",params["agent"]["discount"])
     print("learning rate: ",params["agent"]["alpha"])
     print("map: ",params["environment"]["map_name"])
     print("start_state: ",params["environment"]["start_state"])
     print("goal_states: ",params["environment"]["goal_states"])
-    print("crosswalk_states: ",params["environment"]["crosswalk_states"],"\n")
+    print("crosswalk_states: ",params["environment"]["crosswalk_states"])
+    if params["agent"]["name"] == "QRDQN":
+        print("eval policy: ",params["agent"]["eval_policy"])
+        if params["agent"]["eval_policy"] == "Thresholded_SSD":
+            print("ssd thres: ",params["agent"]["ssd_thres"])
+    print("\n")
 
 def run_trial(params,device):
 
-    #print("\n====== Trial Setup ======\n")
-    #print("seed: ",params["base"]["seed"])
-    #print("num_timesteps: ",params["base"]["num_timesteps"])
-    #print("agent: ",params["agent"]["name"])
-    #print("policy: ",params["policy"])
-    #print("discount: ",params["agent"]["discount"])
-    #print("learning rate: ",params["agent"]["alpha"])
-    #print("map: ",params["environment"]["map_name"])
-    #print("start_state: ",params["environment"]["start_state"])
-    #print("goal_states: ",params["environment"]["goal_states"])
-    #print("crosswalk_states: ",params["environment"]["crosswalk_states"],"\n")
-
     lr = params["agent"]["alpha"]
     sd = params["base"]["seed"]
     cw = params["environment"]["crosswalk_states"]

@@ -129,7 +122,7 @@ def run_trial(params,device):
     evaluate_env.reset()
 
     if params["agent"]["name"] == "QRDQN":
-        save_dir = os.path.join(params["save_dir"],params["agent"]["name"],params["environment"]["map_name"],params["policy"],stoc,"buffer_"+str(params["agent"]["buffer_size"]),"n_quantile_"+str(params["agent"]["n_quantiles"]),"lr_"+str(lr),"seed_"+str(sd))
+        save_dir = os.path.join(params["save_dir"],params["agent"]["name"],params["environment"]["map_name"],params["policy"],params["agent"]["eval_policy"],stoc,"buffer_"+str(params["agent"]["buffer_size"]),"n_quantile_"+str(params["agent"]["n_quantiles"]),"lr_"+str(lr),"seed_"+str(sd))
     else:
         save_dir = os.path.join(params["save_dir"],params["agent"]["name"],params["environment"]["map_name"],params["policy"],stoc,"buffer_"+str(params["agent"]["buffer_size"]),"lr_"+str(lr),"seed_"+str(sd))
 
@@ -144,7 +137,8 @@ def run_trial(params,device):
         policy_args = {"normalize_images":False}
     else:
         raise RuntimeError("The network strucutre is not available")
-
+
+    eval_args = {}
     if params["agent"]["name"] == "PPO":
         model = PPO(params["policy"],
                     behave_env,

@@ -183,6 +177,9 @@ def run_trial(params,device):
                     device=device)
     elif params["agent"]["name"] == "QRDQN":
         policy_args["n_quantiles"] = params["agent"]["n_quantiles"]
+        eval_args["eval_policy"] = params["agent"]["eval_policy"]
+        if params["agent"]["eval_policy"] == "Thresholded_SSD":
+            eval_args["ssd_thres"] = params["agent"]["ssd_thres"]
         model = QRDQN(params["policy"],
                       behave_env,
                       verbose=1,

@@ -198,23 +195,17 @@ def run_trial(params,device):
                       device=device)
     else:
         raise RuntimeError("The agent is not available.")
-
-    model.learn(total_timesteps=params["base"]["num_timesteps"], eval_env=evaluate_env, eval_freq=params["base"]["eval_freq"], n_eval_episodes=1, eval_log_path=save_dir)
-
-    # check number of steps since last reset of env
-    #behave_count = behave_env._get_count()
-    #evaluate_count = evaluate_env._get_count()
-    #count_file = os.path.join(save_dir,"count.txt")
-    #np.savetxt(count_file,[behave_count,evaluate_count],fmt="%d")
 
-    # save all paths in evaluation
+    model.learn(total_timesteps=params["base"]["num_timesteps"], eval_env=evaluate_env, eval_freq=params["base"]["eval_freq"], n_eval_episodes=1, eval_log_path=save_dir, **eval_args)
+
+    # save all paths in evaluations
     all_eval_paths = evaluate_env.get_all_paths()
     paths_file = os.path.join(save_dir,"eval_paths.csv")
     with open(paths_file, "w", newline="") as f:
         write = csv.writer(f)
         write.writerows(all_eval_paths)
 
-    # save all quantiles in evalution (for QR-DQN agent)
+    # save all quantiles in evaluations (for QR-DQN agent)
     if params["agent"]["name"] == "QRDQN":
         all_eval_q = evaluate_env.get_quantiles()
         np.save(os.path.join(save_dir,"eval_quantiles.npy"),all_eval_q)
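Because the QR-DQN save path now includes the evaluation policy, runs that differ only in eval_policy no longer write to the same directory. A rough sketch of the resulting layout, built the same way run_trial does but with entirely hypothetical values for the config-driven components (map, policy, the stoc flag, and the hyperparameters):

    import os

    # Hypothetical values; the real ones come from the experiment config.
    save_dir = os.path.join("experiments", "QRDQN", "Town01", "CnnPolicy",
                            "Thresholded_SSD",   # new path component added by this commit
                            "stochastic",        # assumed value of `stoc`
                            "buffer_" + str(50000),
                            "n_quantile_" + str(50),
                            "lr_" + str(1e-4),
                            "seed_" + str(0))
    print(save_dir)
    # experiments/QRDQN/Town01/CnnPolicy/Thresholded_SSD/stochastic/buffer_50000/n_quantile_50/lr_0.0001/seed_0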

thirdparty/sb3_contrib/qrdqn/qrdqn.py

Lines changed: 6 additions & 0 deletions
@@ -250,6 +250,9 @@ def learn(
         tb_log_name: str = "QRDQN",
         eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
+        ##### local modification #####
+        eval_policy: str = "Greedy",
+        ssd_thres: float = 1e-03
     ) -> OffPolicyAlgorithm:
 
         return super(QRDQN, self).learn(

@@ -262,6 +265,9 @@ def learn(
             tb_log_name=tb_log_name,
             eval_log_path=eval_log_path,
             reset_num_timesteps=reset_num_timesteps,
+            ##### local modification #####
+            eval_policy=eval_policy,
+            ssd_thres=ssd_thres
         )
 
     def _excluded_save_params(self) -> List[str]:
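With the new keyword arguments threaded through QRDQN.learn, the evaluation policy can be selected directly at the learn() call; run_stable_baselines3.py builds the equivalent call from the config via **eval_args. A hedged usage sketch, assuming the patched thirdparty copies of sb3_contrib and stable_baselines3 are on the import path; my_env, my_eval_env, and the numeric settings are placeholders, not values from the repo:

    from sb3_contrib import QRDQN   # patched copy under thirdparty/ in this repo

    # `my_env` and `my_eval_env` are placeholder gym environments.
    model = QRDQN("MlpPolicy", my_env, verbose=1)
    model.learn(total_timesteps=100_000,        # assumed training budget
                eval_env=my_eval_env,
                eval_freq=10_000,               # assumed evaluation frequency
                n_eval_episodes=1,
                eval_policy="Thresholded_SSD",  # new in this commit; defaults to "Greedy"
                ssd_thres=1e-3)                 # only used by "Thresholded_SSD"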

thirdparty/stable_baselines3/common/base_class.py

Lines changed: 11 additions & 1 deletion
@@ -350,6 +350,9 @@ def _init_callback(
         eval_freq: int = 10000,
         n_eval_episodes: int = 5,
         log_path: Optional[str] = None,
+        ##### local modification #####
+        eval_policy: str = "Greedy",
+        ssd_thres: float = 1e-03
     ) -> BaseCallback:
         """
         :param callback: Callback(s) called at every step with state of the algorithm.

@@ -375,6 +378,9 @@ def _init_callback(
                 log_path=log_path,
                 eval_freq=eval_freq,
                 n_eval_episodes=n_eval_episodes,
+                ##### local modification #####
+                eval_policy=eval_policy,
+                ssd_thres=ssd_thres
             )
             callback = CallbackList([callback, eval_callback])
 
@@ -391,6 +397,9 @@ def _setup_learn(
         log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         tb_log_name: str = "run",
+        ##### local modification #####
+        eval_policy: str = "Greedy",
+        ssd_thres: float = 1e-03
     ) -> Tuple[int, BaseCallback]:
         """
         Initialize different variables needed for training.

@@ -442,7 +451,8 @@ def _setup_learn(
         self._logger = utils.configure_logger(self.verbose, self.tensorboard_log, tb_log_name, reset_num_timesteps)
 
         # Create eval callback if needed
-        callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path)
+        ##### local modification #####
+        callback = self._init_callback(callback, eval_env, eval_freq, n_eval_episodes, log_path, eval_policy, ssd_thres)
 
         return total_timesteps, callback
 
thirdparty/stable_baselines3/common/callbacks.py

Lines changed: 14 additions & 0 deletions
@@ -304,6 +304,9 @@ def __init__(
         render: bool = False,
         verbose: int = 1,
         warn: bool = True,
+        ##### local modification #####
+        eval_policy: str = "Greedy",
+        ssd_thres: float = 1e-03
     ):
         super(EvalCallback, self).__init__(callback_on_new_best, verbose=verbose)
         self.n_eval_episodes = n_eval_episodes

@@ -313,6 +316,9 @@ def __init__(
         self.deterministic = deterministic
         self.render = render
         self.warn = warn
+        ##### local modification #####
+        self.eval_policy = eval_policy
+        self.ssd_thres = ssd_thres
 
         # Convert to VecEnv for consistency
         if not isinstance(eval_env, VecEnv):

@@ -384,6 +390,9 @@ def _on_step(self) -> bool:
                 return_episode_rewards=True,
                 warn=self.warn,
                 callback=self._log_success_callback,
+                ##### local modification #####
+                eval_policy=self.eval_policy,
+                ssd_thres=self.ssd_thres
             )
 
             if self.log_path is not None:

@@ -408,6 +417,11 @@ def _on_step(self) -> bool:
             mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
             mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
             self.last_mean_reward = mean_reward
+
+            if self.eval_policy == "Thresholded_SSD":
+                print("Eval policy: Thresholded SSD, ",f"Mean threshold: {self.ssd_thres:.1f}")
+            else:
+                print("Eval policy: ",self.eval_policy)
 
             if self.verbose > 0:
                 print(f"Eval num_timesteps={self.num_timesteps}, " f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")

thirdparty/stable_baselines3/common/evaluation.py

Lines changed: 38 additions & 3 deletions
@@ -7,6 +7,24 @@
 from stable_baselines3.common import base_class
 from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv, VecMonitor, is_vecenv_wrapped
 
+##### local modification #####
+def ssd_policy(quantiles:np.ndarray, use_threshold:bool=False, mean_threshold:float=1e-03):
+    means = np.mean(quantiles,axis=0)
+    sort_idx = np.argsort(-1*means)
+    best_1 = sort_idx[0]
+    best_2 = sort_idx[1]
+    if means[best_1] - means[best_2] > mean_threshold:
+        return best_1
+    else:
+        if use_threshold:
+            signed_second_moment = -1 * np.var(quantiles,axis=0)
+        else:
+            signed_second_moment = -1 * np.mean(quantiles**2,axis=0)
+        action = best_1
+        if signed_second_moment[best_2] > signed_second_moment[best_1]:
+            action = best_2
+        return action
+
 
 def evaluate_policy(
     model: "base_class.BaseAlgorithm",

@@ -18,6 +36,9 @@ def evaluate_policy(
     reward_threshold: Optional[float] = None,
     return_episode_rewards: bool = False,
     warn: bool = True,
+    ##### local modification #####
+    eval_policy: str = "Greedy",
+    ssd_thres: float = 1e-03
 ) -> Union[Tuple[float, float], Tuple[List[float], List[int]]]:
     """
     Runs policy for ``n_eval_episodes`` episodes and returns average reward.

@@ -70,9 +91,9 @@
     )
 
     ##### local modification #####
-    # get quantiles prediction for all state action pair if the agent is QR-DQN
+    # store quantiles prediction for all state action pair if the agent is QR-DQN
     if env.save_q_vals:
-        print("predicting quantiles (QR-DQN)")
+        print("saving quantiles (QR-DQN)")
         all_quantiles = []
         for i in range(env.num_states):
             obs = env.get_obs_at_state(i)

@@ -94,7 +115,21 @@
     observations = env.reset()
     states = None
     while (episode_counts < episode_count_targets).any():
-        actions, states = model.predict(observations, state=states, deterministic=deterministic)
+        ##### local modification #####
+        if eval_policy == "Greedy":
+            actions, states = model.predict(observations, state=states, deterministic=deterministic)
+        # TODO: consider multi environments case
+        elif eval_policy == "SSD":
+            q_vals = model.predict_quantiles(observations)
+            actions = np.array([ssd_policy(q_vals.cpu().data.numpy()[0])])
+            states = None
+        elif eval_policy == "Thresholded_SSD":
+            q_vals = model.predict_quantiles(observations)
+            actions = np.array([ssd_policy(q_vals.cpu().data.numpy()[0],use_threshold=True,mean_threshold=ssd_thres)])
+            states = None
+        else:
+            raise RuntimeError("The evaluation policy is not available.")
+
         observations, rewards, dones, infos = env.step(actions)
         ##### local modification #####
         #current_rewards += rewards
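In words, ssd_policy is a tie-breaking rule over quantile predictions: it takes the two actions with the highest mean return and, when their means are within mean_threshold of each other, compares a negated second-moment statistic (the raw second moment for "SSD", the variance when use_threshold is set, as in "Thresholded_SSD") and keeps the action with the smaller dispersion. A small self-contained sketch of that behaviour on made-up numbers; the import path assumes the patched thirdparty copy of stable_baselines3 is importable, the (n_quantiles, n_actions) shape is inferred from the axis=0 reductions above, and all values are illustrative:

    import numpy as np
    # Assumes the patched thirdparty copy of stable_baselines3 is on the import path,
    # so the ssd_policy helper added above can be imported directly.
    from stable_baselines3.common.evaluation import ssd_policy

    # Toy quantile predictions for one state, shape (n_quantiles, n_actions).
    # Both actions have mean 1.0; action 0 is far more dispersed than action 1.
    quantiles = np.array([[0.0, 0.9],
                          [1.0, 1.0],
                          [2.0, 1.1]])

    print(ssd_policy(quantiles))                                          # 1: means tie, smaller second moment wins
    print(ssd_policy(quantiles, use_threshold=True, mean_threshold=0.1))  # 1: same idea, dispersion measured by variance

    # If one mean clearly exceeds the threshold gap, the highest-mean action wins outright.
    quantiles[2, 0] = 2.3   # action 0 now has mean 1.1
    print(ssd_policy(quantiles))                                          # 0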

thirdparty/stable_baselines3/common/off_policy_algorithm.py

Lines changed: 12 additions & 0 deletions
@@ -278,6 +278,9 @@ def _setup_learn(
         log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
         tb_log_name: str = "run",
+        ##### local modification #####
+        eval_policy: str = "Greedy",
+        ssd_thres: float = 1e-03
     ) -> Tuple[int, BaseCallback]:
         """
         cf `BaseAlgorithm`.

@@ -320,6 +323,9 @@ def _setup_learn(
             log_path,
             reset_num_timesteps,
             tb_log_name,
+            ##### local modification #####
+            eval_policy,
+            ssd_thres
         )
 
     def learn(

@@ -333,6 +339,9 @@ def learn(
         tb_log_name: str = "run",
         eval_log_path: Optional[str] = None,
         reset_num_timesteps: bool = True,
+        ##### local modification #####
+        eval_policy: str = "Greedy",
+        ssd_thres: float = 1e-03
     ) -> "OffPolicyAlgorithm":
 
         total_timesteps, callback = self._setup_learn(

@@ -344,6 +353,9 @@ def learn(
             eval_log_path,
             reset_num_timesteps,
             tb_log_name,
+            ##### local modification #####
+            eval_policy,
+            ssd_thres
         )
 
         callback.on_training_start(locals(), globals())
