@@ -54,7 +54,7 @@ def __setstate__(self, state):
             group.setdefault('nesterov', False)
 
     @torch.no_grad()
-    def step(self, local_best_param_group, global_best_param_group, is_psosgd, closure=None):
+    def step(self, local_best_param_group, global_best_param_group, use_pso, use_sgd, closure=None):
        """Performs a single optimization step.
 
        Arguments:
@@ -80,37 +80,48 @@ def step(local_best_param_group, global_best_param_group, is_psosgd, closu
             weight_global_optmized_location = group['weight_global_optmized_location']
 
             for p_index, p in enumerate(group['params']):
-                if is_psosgd:
+                if use_pso:
                     local_best_p = local_best_param_group[p_index]
                     global_best_p = global_best_param_group[p_index]
+
                 if p.grad is None:
                     continue
-                d_p = p.grad
-                if weight_decay != 0:
-                    d_p = d_p.add(p, alpha=weight_decay)
+
+                if use_sgd:
+                    d_p = p.grad
+                    if weight_decay != 0:
+                        d_p = d_p.add(p, alpha=weight_decay)
+                else:
+                    d_p = -(vlimit_min + (vlimit_max - vlimit_min) * torch.rand(p.shape))
+
                 if momentum != 0:
                     param_state = self.state[p]
                     if 'momentum_buffer' not in param_state:
                         buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                     else:
                         buf = param_state['momentum_buffer']
-                        # buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
+
                         buf.mul_(momentum)
-                        if is_psosgd:
+                        if use_pso:
                             buf.sub_(local_best_p.sub(p), alpha=weight_particle_optmized_location * random.random())
                             buf.sub_(global_best_p.sub(p), alpha=weight_global_optmized_location * random.random())
-                        buf.add_(d_p, alpha=1 - dampening)
 
-                        if is_psosgd:
+                        if use_sgd:
+                            buf.add_(d_p, alpha=1 - dampening)
+
+                        if use_pso:
                             buf[buf > vlimit_max] = vlimit_max
                             buf[buf < vlimit_min] = vlimit_min
 
-                    if nesterov:
+                    if use_sgd and nesterov:
                         d_p = d_p.add(buf, alpha=momentum)
                     else:
                         d_p = buf
 
-                p.add_(d_p, alpha=-lr)
+                if use_sgd:
+                    p.add_(d_p, alpha=-lr)
+                else:  # When SGD is not used, the learning rate parameter lr is invalid.
+                    p.add_(d_p, alpha=-1)
                 # p[p>xlimit_max] = xlimit_max
                 # p[p<xlimit_min] = xlimit_min
 
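This change splits the single `is_psosgd` flag into two independent switches: `use_sgd` controls the gradient/weight-decay contribution, the dampening term, the Nesterov branch, and scaling the final update by `lr`; `use_pso` controls pulling the momentum buffer toward the local and global best parameters and clamping it to `[vlimit_min, vlimit_max]`. When `use_sgd` is false, `d_p` starts from a random velocity drawn from the velocity limits instead of the gradient, and the parameter moves by the full buffer rather than by an `lr`-scaled step. Below is a minimal usage sketch for the new signature. The optimizer class name `PSOSGD`, its constructor arguments, and the way the best parameters are tracked are assumptions for illustration; only the `step(local_best_param_group, global_best_param_group, use_pso, use_sgd)` call itself comes from this diff.

```python
import torch
import torch.nn as nn

# Hypothetical sketch: `PSOSGD` is an assumed name for the optimizer defined
# in this file; the keyword arguments mirror the keys read from the param
# group in the diff (vlimit_min/vlimit_max, weight_particle_optmized_location,
# weight_global_optmized_location).
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = PSOSGD(
    model.parameters(), lr=0.01, momentum=0.9,
    vlimit_min=-0.1, vlimit_max=0.1,
    weight_particle_optmized_location=0.5,
    weight_global_optmized_location=0.5,
)

# The caller keeps detached copies of the particle's best parameters (local
# best) and the swarm's best parameters (global best), in the same order as
# the optimizer's param group.
local_best = [p.detach().clone() for p in model.parameters()]
global_best = [p.detach().clone() for p in model.parameters()]
best_loss = float('inf')

x = torch.randn(64, 10)  # toy data for illustration
y = torch.randn(64, 1)

for epoch in range(100):
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    # Hybrid update: gradient + momentum step (use_sgd=True), with the buffer
    # additionally pulled toward the local and global bests and clamped to
    # the velocity limits (use_pso=True).
    optimizer.step(local_best, global_best, use_pso=True, use_sgd=True)

    if loss.item() < best_loss:
        best_loss = loss.item()
        local_best = [p.detach().clone() for p in model.parameters()]
        global_best = [p.detach().clone() for p in model.parameters()]
```

With a single model there is only one particle, so the local and global bests coincide here; in a real swarm each particle would keep its own `local_best` while `global_best` is shared across particles. Passing `use_pso=True, use_sgd=False` would give a pure PSO step in which `lr` has no effect, matching the comment in the diff.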