Description
CI recurrently fails, affecting various tests, with the following error message (example run: https://github.com/huggingface/trl/actions/runs/18367187589/job/52322217932):
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.11 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.65 GiB is free. Process 19066 has 6.79 GiB memory in use. Process 19069 has 514.00 MiB memory in use. Process 19060 has 4.93 GiB memory in use. Process 19063 has 894.00 MiB memory in use. Of the allocated memory 4.50 GiB is allocated by PyTorch, and 2.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
For example:
FAILED tests/test_sft_trainer.py::TestSFTTrainer::test_train_toolcall_data - torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.11 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.65 GiB is free. Process 19066 has 6.79 GiB memory in use. Process 19069 has 514.00 MiB memory in use. Process 19060 has 4.93 GiB memory in use. Process 19063 has 894.00 MiB memory in use. Of the allocated memory 4.50 GiB is allocated by PyTorch, and 2.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
= 1 failed, 955 passed, 50 skipped, 1 xfailed, 1 xpassed, 140 warnings, 16 rerun in 947.45s (0:15:47) =
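The allocator hint in the message above suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce fragmentation. As a minimal sketch of that suggestion (an assumption, not a verified fix for this CI failure), the variable could be exported in the workflow environment or set in a hypothetical conftest.py before anything allocates on the GPU:

# Hypothetical conftest.py snippet: set the allocator config before the first
# CUDA allocation so expandable_segments takes effect in every test process.
import os

os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

Whether this avoids the OOM here is untested; the error message also shows four processes sharing the same 14.74 GiB GPU, so overall memory pressure may simply be too high.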
Stacktrace:
.venv/lib/python3.10/site-packages/transformers/trainer.py:2325: in train
    return inner_training_loop(
.venv/lib/python3.10/site-packages/transformers/trainer.py:2674: in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
trl/trainer/sft_trainer.py:1185: in training_step
    return super().training_step(*args, **kwargs)
.venv/lib/python3.10/site-packages/transformers/trainer.py:4020: in training_step
    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <trl.trainer.sft_trainer.SFTTrainer object at 0x7fa98e244250>
model = Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151665, 8)
    (layers): ModuleList(
      (0-1...one)
    (rotary_emb): Qwen2RotaryEmbedding()
  )
  (lm_head): Linear(in_features=8, out_features=151665, bias=False)
)
inputs = {'attention_mask': tensor([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1... -100],
        [151644, 8948, 198, ..., -100, -100, -100]],
       device='cuda:0'), 'use_cache': False}
return_outputs = False, num_items_in_batch = tensor(2756, device='cuda:0')
def compute_loss(
    self,
    model: nn.Module,
    inputs: dict[str, Union[torch.Tensor, Any]],
    return_outputs: bool = False,
    num_items_in_batch: Optional[torch.Tensor] = None,
):
    """
    Compute training loss and additionally compute token accuracies
    """
    mode = "train" if self.model.training else "eval"
    # Set aside labels as it will be dropped by super().compute_loss() if a custom `compute_loss_func` is used.
    # This can be removed when this issue is fixed.
    labels = inputs["labels"]
    # If not set, defaults from model config and may warn since cache isn't compatible with gradient checkpointing
    inputs["use_cache"] = False
    (loss, outputs) = super().compute_loss(
        model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
    )
    # Compute entropy
    if not self.args.use_liger_kernel:  # liger doesn't return logits
        with torch.no_grad():
            per_token_entropy = entropy_from_logits(outputs.logits)
            # When using Prompt Tuning, skip the virtual tokens in logits before entropy computation, since they
            # do not correspond to actual input tokens.
            if (
                self.num_virtual_tokens > 0
                and model.peft_config[model.active_adapter].peft_type != PeftType.PREFIX_TUNING
            ):
                per_token_entropy = per_token_entropy[:, self.num_virtual_tokens :]
            if "attention_mask" in inputs:
                attention_mask = inputs["attention_mask"]
                entropy = torch.sum(per_token_entropy * attention_mask) / attention_mask.sum()
            elif "position_ids" in inputs:
                entropy = torch.mean(per_token_entropy)
            else:
                raise ValueError("Expected 'attention_mask' or 'position_ids' in inputs.")
            entropy = self.accelerator.gather_for_metrics(entropy).mean().item()
            self._metrics[mode]["entropy"].append(entropy)
    if mode == "train":
        # When using padding-free, the attention_mask is not present in the inputs, instead we have cu_seq_lens_q,
        # cu_seq_lens_k, and max_length_k, max_length_q and position_ids.
        if "attention_mask" in inputs:
            num_tokens_in_batch = self.accelerator.gather_for_metrics(inputs["attention_mask"].sum()).sum().item()
        elif "position_ids" in inputs:
            local_num_tokens = torch.tensor(inputs["position_ids"].size(1), device=inputs["position_ids"].device)
            num_tokens_in_batch = self.accelerator.gather_for_metrics(local_num_tokens).sum().item()
        else:
            raise ValueError("Expected 'attention_mask' or 'position_ids' in inputs.")
        self._total_train_tokens += num_tokens_in_batch
        self._metrics[mode]["num_tokens"] = [self._total_train_tokens]
    # Compute token accuracy if we have labels and if the model is not using Liger (no logits)
    if not self.args.use_liger_kernel:
        with torch.no_grad():
            if "shift_labels" in inputs:
                # When using CP, labels are pre-shifted. We must use these (and cannot manually shift) because:
                # - The first discarded token from inputs["labels"] actually belongs to process n-1
                # - The last logits require the label from process n+1
                shift_logits = outputs.logits.contiguous()
                shift_labels = inputs["shift_labels"]
            else:
>               shift_logits = outputs.logits[..., :-1, :].contiguous()
E torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.11 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.65 GiB is free. Process 19066 has 6.79 GiB memory in use. Process 19069 has 514.00 MiB memory in use. Process 19060 has 4.93 GiB memory in use. Process 19063 has 894.00 MiB memory in use. Of the allocated memory 4.50 GiB is allocated by PyTorch, and 2.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
trl/trainer/sft_trainer.py:1146: OutOfMemoryError
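For context on the size of the failed 2.11 GiB allocation: the raising line, shift_logits = outputs.logits[..., :-1, :].contiguous(), materializes a full copy of the logits. Even though the test model's hidden size is only 8, the logits carry the 151665-token vocabulary dimension, so the copy is large. A back-of-the-envelope sketch, where batch size and sequence length are assumptions for illustration and not values taken from the log:

# Rough size estimate of the contiguous logits copy; batch_size and seq_len are
# assumed values, only vocab_size (151665) comes from the traceback above.
vocab_size = 151_665
batch_size, seq_len = 8, 470  # assumed, not confirmed by the log
bytes_per_elem = 4            # float32 logits

copy_bytes = batch_size * (seq_len - 1) * vocab_size * bytes_per_elem
print(f"{copy_bytes / 2**30:.2f} GiB")  # ~2.12 GiB, in the ballpark of the failed 2.11 GiB allocation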