@@ -209,22 +209,22 @@ def get_action_and_value(self, x, action=None):

         for step in range(0, args.num_steps):
             global_step += args.num_envs
-
+
             ob = next_ob
             # ALGO LOGIC: action logic
             with torch.no_grad():
                 action, logprob, _, value = agent.get_action_and_value(ob)

             # TRY NOT TO MODIFY: execute the game and log data.
             next_ob, reward, next_termination, next_truncation, info = envs.step(action.cpu().numpy())
-
+
             # Correct next observation (for vec gym)
             real_next_ob = next_ob.copy()
             for idx, trunc in enumerate(next_truncation):
                 if trunc:
                     real_next_ob[idx] = info["final_observation"][idx]
             next_ob = torch.Tensor(next_ob).to(device)
-
+
             # Collect trajectory
             obs[step] = torch.Tensor(ob).to(device)
             next_obs[step] = torch.Tensor(real_next_ob).to(device)
@@ -234,7 +234,7 @@ def get_action_and_value(self, x, action=None):
             next_terminations[step] = torch.Tensor(next_termination).to(device)
             next_dones[step] = torch.Tensor(np.logical_or(next_termination, next_truncation)).to(device)
             rewards[step] = torch.tensor(reward).to(device).view(-1)
-
+
             if "final_info" in info:
                 for info in info["final_info"]:
                     if info and "episode" in info:
@@ -253,7 +253,7 @@ def get_action_and_value(self, x, action=None):
                 else:
                     value_mask = next_dones[t].bool()
                     next_values[value_mask] = agent.get_value(next_obs[t][value_mask]).flatten()
-                    next_values[~value_mask] = values[t + 1][~value_mask]
+                    next_values[~value_mask] = values[t + 1][~value_mask]
                 delta = rewards[t] + args.gamma * next_values * (1 - next_terminations[t]) - values[t]
                 advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * (1 - next_dones[t]) * lastgaelam
             returns = advantages + values
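
The substantive logic this diff touches is a GAE computation that treats truncation differently from termination: the bootstrap value is kept when an episode is cut off by a time limit (truncation) but zeroed when it actually terminates, while either kind of episode end stops the recursive accumulation of advantage. Below is a minimal, self-contained sketch of that idea, not the commit's exact code: it assumes the rollout buffers (`rewards`, `values`, `next_values`, `next_terminations`, `next_dones`) are already filled `(num_steps, num_envs)` tensors, whereas the code above evaluates `agent.get_value` lazily for truncated steps inside the backward loop.

```python
import torch


def compute_gae(rewards, values, next_values, next_terminations, next_dones,
                gamma=0.99, gae_lambda=0.95):
    """GAE that bootstraps through truncations but not terminations.

    All inputs are (num_steps, num_envs) tensors; `next_values` holds the
    value estimate of the (real) observation that followed each step, which
    is an assumed precomputation for this sketch.
    """
    advantages = torch.zeros_like(rewards)
    lastgaelam = torch.zeros_like(rewards[0])
    for t in reversed(range(rewards.shape[0])):
        # Terminated episodes have no future value; truncated ones still
        # bootstrap from the value of their final observation.
        delta = rewards[t] + gamma * next_values[t] * (1 - next_terminations[t]) - values[t]
        # Any episode end (termination or truncation) stops the accumulation
        # of advantage from later timesteps.
        lastgaelam = delta + gamma * gae_lambda * (1 - next_dones[t]) * lastgaelam
        advantages[t] = lastgaelam
    returns = advantages + values
    return advantages, returns
```

The two masks mirror the `delta` and `advantages[t] = lastgaelam = ...` lines in the diff above: `(1 - next_terminations[t])` gates the bootstrap value, while `(1 - next_dones[t])` gates the recursion.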