Fix Gradient Accumulation issue #34191
Merged
Commits (58, all by ArthurZucker):

44301cc  quick fix
1456088  3 losses
e57f00c  oups
7fa8503  fix
b955ea5  nits
07478e0  check how it scales for special models
1b356ef  propagate for conditiona detr
4ef45b0  propagate
61da9b1  propagate
2e3f0f7  propagate
c31a3fb  fixes
a8cd107  propagate changes
711c357  update
4888cf3  fixup
4323d85  nits
e5e4bbd  f string
239a256  fixes
bd298da  more fixes
5dfc51c  ?
0a1cd2b  nit
64f7e29  arg annoying f string
aa01ae9  nits
8c1d68a  grumble
846cf1c  update
e7e8a20  nit
622290c  refactor
91e28aa  fix fetch tests
da649b9  nit
df6472a  nit
cf1eb7b  Update src/transformers/loss/loss_utils.py
dafd11b  Merge branch 'quick-fix-ga' of github.com:huggingface/transformers in…
30f27cd  update
d0edfad  nit
9bcecc3  fixup
2839b3c  make pass
557d225  nits
393e178  port code to more models
aac054d  fixup
ce32d5e  ntis
4dc49ac  arf
d221e58  update
f03b193  update
22b6283  nits
64829e3  update
0b6f425  fix
e6f6f52  update
fa691aa  nits
66f6eef  fine
fcdf13d  agjkfslga.jsdlkgjklas
ece5e01  nits
bb236eb  fix fx?
7c2b7ce  update
0be4379  update
36d76d7  styel
92979e7  fix imports
a55e440  update
b14c3dd  update
dbbc3ce  fixup to fix the torch fx?
src/transformers/loss/loss_utils.py (new file)
@@ -0,0 +1,98 @@
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from .models.detr.loss_detr import ForObjectDetectionLoss, ForSegmentationLoss


def DefaultCrossEntropyLoss(logits, labels, **kwargs):
    # Upcast to float to avoid potential precision issues when computing the loss
    logits = logits.float()
    # Shift so that tokens < n predict n
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Flatten the tokens
    shift_logits = shift_logits.view(-1, kwargs["vocab_size"])
    shift_labels = shift_labels.view(-1)
    # Enable model parallelism
    shift_labels = shift_labels.to(shift_logits.device)

    num_items = kwargs.pop("num_items", None)

    if num_items is not None:
        # Compute the cross-entropy manually when using gradient accumulation:
        # sum the per-token losses and divide by the global number of items
        # across all accumulated micro-batches, instead of averaging per batch.
        log_probs = nn.functional.log_softmax(shift_logits, dim=-1)
        # Mask out ignored positions (label -100) so they neither select a
        # spurious class via negative indexing nor contribute to the sum.
        valid = shift_labels != -100
        indices = torch.arange(shift_labels.size(0), device=shift_labels.device)
        loss = -log_probs[indices[valid], shift_labels[valid]]
        loss = loss.sum() / num_items
    else:
        loss = nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100)

    return loss


def ForSequenceClassificationLoss(logits, labels, pooled_logits, **kwargs):
    config = kwargs["config"]
    num_labels = config.num_labels
    # Infer the problem type from the number of labels and the label dtype if unset
    if config.problem_type is None:
        if num_labels == 1:
            config.problem_type = "regression"
        elif num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
            config.problem_type = "single_label_classification"
        else:
            config.problem_type = "multi_label_classification"

    if config.problem_type == "regression":
        loss_fct = MSELoss()
        if num_labels == 1:
            loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
        else:
            loss = loss_fct(pooled_logits, labels)
    elif config.problem_type == "single_label_classification":
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(pooled_logits.view(-1, num_labels), labels.view(-1))
    elif config.problem_type == "multi_label_classification":
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(pooled_logits, labels)
    return loss


def ForQuestionAnsweringLoss(start_logits, end_logits, start_positions, end_positions):
    total_loss = None
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, squeeze the extra dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1).to(start_logits.device)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1).to(end_logits.device)
        # Sometimes the start/end positions fall outside the model inputs; ignore these terms
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2
    return total_loss


def ForTokenClassification(logits, labels, config, **kwargs):
    # Flatten the tokens and upcast to float to avoid potential precision issues
    logits = logits.view(-1, config.num_labels)
    labels = labels.view(-1)
    logits = logits.float()
    loss_fct = CrossEntropyLoss()
    return loss_fct(logits, labels)


LOSS_MAPPING = {
    "ForCausalLM": DefaultCrossEntropyLoss,
    "ForQuestionAnswering": ForQuestionAnsweringLoss,
    "ForSequenceClassification": ForSequenceClassificationLoss,
    "ForTokenClassification": ForTokenClassification,
}

LOSS_MAPPING["ForSegmentation"] = ForSegmentationLoss
LOSS_MAPPING["ForObjectDetection"] = ForObjectDetectionLoss
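
For context, the keys in LOSS_MAPPING are model-class-name suffixes. A hedged sketch of how a caller might resolve the right loss function from a class name; the resolve_loss_fn helper is hypothetical, written for illustration, and is not the lookup code this PR adds:

def resolve_loss_fn(model_class_name, loss_mapping):
    # Match the longest registered suffix first, so a name like
    # "...ForTokenClassification" is not caught by a shorter key.
    for suffix in sorted(loss_mapping, key=len, reverse=True):
        if model_class_name.endswith(suffix):
            return loss_mapping[suffix]
    raise KeyError(f"No loss function registered for {model_class_name}")

# e.g. resolve_loss_fn("LlamaForCausalLM", LOSS_MAPPING) returns DefaultCrossEntropyLoss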