Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion src/accelerate/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import inspect

import torch

from collections import defaultdict
from .state import AcceleratorState, GradientState
from .utils import DistributedType, honor_type, is_lomo_available, is_torch_xla_available

Expand Down Expand Up @@ -196,6 +196,26 @@ def __setstate__(self, state):
self._optimizer_original_step_method = self.optimizer.step
self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)

def multiply_grads(self, constant: "float | torch.Tensor") -> None:
    """
    Multiplies the gradients of the parameters by a constant, in place.
    Needed during gradient accumulation.

    Based on the implementation out of `fairseq`: https://github.com/facebookresearch/fairseq/blob/main/fairseq/optim/fairseq_optimizer.py

    Args:
        constant (`float` or `torch.Tensor`):
            The factor every existing gradient is multiplied by. A tensor constant is moved to the device of the
            gradients it is applied to before the multiplication.

    Parameters found in `self.param_groups` whose `grad` is `None` are skipped.
    """
    # NOTE: the annotation is quoted so that `float | torch.Tensor` is not evaluated at definition time, which
    # raises a `TypeError` on Python < 3.10 when annotations are not deferred.
    constant_is_tensor = torch.is_tensor(constant)

    # Dense gradients are bucketed by the device and dtype of the *gradient* itself (the tensor that is actually
    # multiplied), so every `torch._foreach_mul_` call below receives a homogeneous list.
    per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
    for param_group in self.param_groups:
        for param in param_group["params"]:
            grad = param.grad
            if grad is None:
                continue
            if grad.is_sparse:
                # Sparse gradients are scaled one by one instead of being added to the batched buckets.
                grad.data.mul_(constant.to(grad.device) if constant_is_tensor else constant)
            else:
                per_device_and_dtype_grads[grad.device][grad.dtype].append(grad.data)

    for device, per_dtype_grads in per_device_and_dtype_grads.items():
        # Move the constant once per device rather than once per dtype bucket.
        device_constant = constant.to(device) if constant_is_tensor else constant
        for grads in per_dtype_grads.values():
            torch._foreach_mul_(grads, device_constant)


def patch_optimizer_step(accelerated_optimizer: AcceleratedOptimizer, method):
def patched_step(*args, **kwargs):
Expand Down
Loading