# utils.py (forked from Vincent-Yu-83/pytorch_train)
import math
import warnings

import torch
import torch.distributed
from torch.optim.lr_scheduler import CosineAnnealingLR


class ZeroOneNormalize(object):
    """Scale an integer image tensor with values in [0, 255] to a float tensor in [0, 1]."""

    def __call__(self, img):
        return img.float().div(255)
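

def _example_zero_one_pipeline():
    """Illustrative usage sketch (not part of the original file; the transform pipeline
    is an assumption): ZeroOneNormalize is meant to follow a transform that yields a
    uint8 tensor, such as torchvision's PILToTensor."""
    from torchvision import transforms  # local import so the module itself does not require torchvision
    return transforms.Compose([
        transforms.PILToTensor(),  # PIL image -> uint8 tensor with values in [0, 255]
        ZeroOneNormalize(),        # uint8 tensor -> float tensor in [0, 1]
    ])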


class CosineAnnealingLRWarmup(CosineAnnealingLR):
    """Cosine annealing schedule preceded by a geometric warmup from warmup_start_lr to each base LR."""

    def __init__(self, optimizer, T_max, eta_min=1.0e-5, last_epoch=-1, verbose=False,
                 warmup_steps=2, warmup_start_lr=1.0e-5):
        # `verbose` is accepted for interface compatibility; it is not forwarded to CosineAnnealingLR.
        super(CosineAnnealingLRWarmup, self).__init__(optimizer, T_max=T_max,
                                                      eta_min=eta_min,
                                                      last_epoch=last_epoch)
        self.warmup_steps = warmup_steps
        self.warmup_start_lr = warmup_start_lr
        if warmup_steps > 0:
            # Per-group factor chosen so that warmup_start_lr * factor ** warmup_steps == base_lr.
            self.base_warmup_factors = [
                (base_lr / warmup_start_lr) ** (1.0 / self.warmup_steps)
                for base_lr in self.base_lrs
            ]

    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.", UserWarning)
        # Both the warmup and the cosine phases are computed in closed form.
        return self._get_closed_form_lr()

    def _get_closed_form_lr(self):
        if hasattr(self, 'warmup_steps'):
            if self.last_epoch < self.warmup_steps:
                # Warmup phase: grow the LR geometrically from warmup_start_lr towards base_lr.
                return [self.warmup_start_lr * (warmup_factor ** self.last_epoch)
                        for warmup_factor in self.base_warmup_factors]
            else:
                # Cosine annealing phase, re-indexed so it starts right after the warmup steps.
                return [self.eta_min + (base_lr - self.eta_min) *
                        (1 + math.cos(math.pi * (self.last_epoch - self.warmup_steps) /
                                      (self.T_max - self.warmup_steps))) * 0.5
                        for base_lr in self.base_lrs]
        else:
            # Fallback (warmup attributes not set yet): plain cosine annealing.
            return [self.eta_min + (base_lr - self.eta_min) *
                    (1 + math.cos(math.pi * self.last_epoch / self.T_max)) * 0.5
                    for base_lr in self.base_lrs]
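

def _example_warmup_scheduler(model):
    """Illustrative usage sketch (not part of the original file; the SGD settings and
    T_max below are assumptions): build the warmup cosine scheduler and step it once
    per epoch after the optimizer updates."""
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    scheduler = CosineAnnealingLRWarmup(optimizer, T_max=100, eta_min=1.0e-5,
                                        warmup_steps=5, warmup_start_lr=1.0e-5)
    # Typical loop: train one epoch with `optimizer`, then advance the schedule.
    # for epoch in range(100):
    #     train_one_epoch(...)  # hypothetical training step
    #     scheduler.step()
    return optimizer, scheduler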


def evaluate_accuracy_and_loss(data_iter, model, loss, accelerator=None, is_half=False, local_rank=-1, world_size=1):
    """Return (accuracy, loss) of `model` averaged over all samples in `data_iter`.

    Supports three settings: single process (default), Hugging Face accelerate
    (pass `accelerator`), and torch.distributed DDP/FSDP (pass `local_rank` and
    `world_size`). In the distributed settings, labels and predictions are gathered
    across processes before the metrics are accumulated.
    """
    acc_sum = 0.0
    loss_sum = 0.0
    n = 0
    with torch.no_grad():
        for X, y in data_iter:
            X = X.cuda()
            if is_half:
                X = X.half()
            y = y.cuda()
            y_pred = model(X)
            # if local_rank == 0:
            #     print("evaluate, local rank: {}, {}, {}, {}".format(local_rank, X.shape, y.shape, y_pred.shape))
            # Non-distributed case: use the local tensors directly.
            y_gather = y.clone().detach()
            y_pred_gather = y_pred.clone().detach()
            if accelerator:
                # accelerate: gather labels and predictions from all processes.
                # X = accelerator.gather(X)
                y_gather = accelerator.gather(y)
                y_pred_gather = accelerator.gather(y_pred)
            elif local_rank != -1:
                # DDP / FSDP: all-gather into preallocated tensors of size world_size * batch.
                # torch.distributed.all_gather_into_tensor(X, X)
                y_gather = torch.zeros_like(y).repeat(world_size)
                y_pred_gather = torch.zeros_like(y_pred).repeat((world_size, 1))
                torch.distributed.all_gather_into_tensor(y_gather, y)
                torch.distributed.all_gather_into_tensor(y_pred_gather, y_pred)
                # print("y_gather: {}, y_pred_gather: {}".format(y_gather.shape, y_pred_gather.shape))
            # if local_rank == 0:
            #     print("evaluate, local rank: {}, {}, {}, {}".format(local_rank, X.shape, y_gather.shape, y_pred_gather.shape))
            acc_sum += (y_pred_gather.argmax(dim=1) == y_gather).sum().item()
            loss_sum += loss(y_pred_gather, y_gather).sum().item()
            n += y_gather.shape[0]
            # break
    return acc_sum / n, loss_sum / n
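

def _example_single_gpu_evaluation(model, test_loader):
    """Illustrative usage sketch (not part of the original file; `test_loader` and the
    loss choice are assumptions): single-process, single-GPU evaluation. For DDP/FSDP
    pass `local_rank` and `world_size` from the launcher; for accelerate pass the
    Accelerator instance via `accelerator`."""
    model = model.cuda().eval()
    # reduction="sum" makes loss_sum / n in evaluate_accuracy_and_loss a per-sample average.
    criterion = torch.nn.CrossEntropyLoss(reduction="sum")
    return evaluate_accuracy_and_loss(test_loader, model, criterion)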