Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/accelerate/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import inspect
from collections import defaultdict

import torch

Expand Down Expand Up @@ -196,6 +197,26 @@ def __setstate__(self, state):
self._optimizer_original_step_method = self.optimizer.step
self._optimizer_patched_step_method = patch_optimizer_step(self, self.optimizer.step)

def multiply_grads(self, constant):
    """
    Scale every parameter gradient in-place by `constant` (a Python scalar or a tensor).
    Needed during gradient accumulation.

    Sparse gradients are scaled individually; dense gradients are bucketed by
    (device, dtype) so each bucket is handled with a single fused
    `torch._foreach_mul_` call.

    Based on the implementation out of `fairseq`:
    https://github.com/facebookresearch/fairseq/blob/main/fairseq/optim/fairseq_optimizer.py
    """
    constant_is_tensor = torch.is_tensor(constant)
    dense_buckets = defaultdict(lambda: defaultdict(list))

    # Flatten all parameter groups into a single stream of params that carry a grad.
    params_with_grad = (
        p for group in self.param_groups for p in group["params"] if p.grad is not None
    )
    for p in params_with_grad:
        grad = p.grad.data
        if grad.is_sparse:
            # Sparse grads don't support the foreach fast path; scale them one at a time.
            factor = constant.to(grad.device) if constant_is_tensor else constant
            grad.mul_(factor)
        else:
            dense_buckets[p.device][p.dtype].append(grad)

    for device, grads_by_dtype in dense_buckets.items():
        # A tensor multiplier must live on the grads' device; hoist the move per bucket.
        factor = constant.to(device) if constant_is_tensor else constant
        for grad_list in grads_by_dtype.values():
            torch._foreach_mul_(grad_list, factor)


def patch_optimizer_step(accelerated_optimizer: AcceleratedOptimizer, method):
def patched_step(*args, **kwargs):
Expand Down
Loading