Skip to content

Commit 6d5a064

Browse files
authored
Merge pull request #477 from ArneBinder/fix/half_precision_model_tests
Fix `test_annotation_pipeline` fails with half-precision-model=True - adjust `test_annotation_pipeline`: - streamline the test (e.g. use `resolve()` etc.) - create individual test branches with individual expected scores for all combinations of `half_precision_ops` and `half_precision_model` - decrease absolute tolerance to `1e-6` - use `10e-2` as absolute tolerance when `half_precision_model` (reasoning: using half_precision_model on cpu results in using dtype=torch.bfloat16 which has only 8 significant precision bits, so we use 10e-2 as absolute tolerance) - enable `torch.use_deterministic_algorithms` to make sure results are as reproducible as possible.
2 parents 64335d9 + a0bf4db commit 6d5a064

File tree

2 files changed

+57
-28
lines changed

2 files changed

+57
-28
lines changed

src/pytorch_ie/pipeline.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,17 @@ def __call__(
481481

482482
show_progress_bar = forward_params.pop("show_progress_bar", False)
483483
half_precision_ops = forward_params.pop("half_precision_ops", False)
484+
485+
# Torch documentation recommends: "When entering an autocast-enabled region, Tensors may be any type.
486+
# You should not call half() or bfloat16() on your model(s) or inputs when using autocasting."
487+
# (see https://docs.pytorch.org/docs/stable/amp.html#torch.autocast). So show a warning in this case.
488+
if half_precision_ops:
489+
if self.model.dtype == get_autocast_dtype(self.device.type):
490+
logger.warning(
491+
"Using half precision operations with a model already in half precision. "
492+
"This is not recommended, as it may lead to unexpected results."
493+
)
494+
484495
model_outputs: List = []
485496
with torch.no_grad():
486497
with torch.autocast(device_type=self.device.type, enabled=half_precision_ops):
Lines changed: 46 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from dataclasses import dataclass
2-
from typing import Sequence
32

43
import pytest
4+
import torch
55
from pie_core import AnnotationLayer, annotation_field
66

77
from pytorch_ie import AutoPipeline
@@ -11,6 +11,8 @@
1111
from pytorch_ie.pipeline import Pipeline
1212
from pytorch_ie.taskmodules import TransformerRETextClassificationTaskModule
1313

14+
torch.use_deterministic_algorithms(True)
15+
1416

1517
@dataclass
1618
class ExampleDocument(TextDocument):
@@ -22,7 +24,9 @@ class ExampleDocument(TextDocument):
2224
@pytest.mark.parametrize("use_auto", [False, True])
2325
@pytest.mark.parametrize("half_precision_model", [False, True])
2426
@pytest.mark.parametrize("half_precision_ops", [False, True])
25-
def test_re_text_classification(use_auto, half_precision_model, half_precision_ops):
27+
def test_re_text_classification(use_auto, half_precision_model, half_precision_ops, caplog):
28+
29+
# set up the pipeline
2630
model_name_or_path = "pie/example-re-textclf-tacred"
2731
if use_auto:
2832
pipeline = AutoPipeline.from_pretrained(
@@ -45,39 +49,53 @@ def test_re_text_classification(use_auto, half_precision_model, half_precision_o
4549
assert pipeline.taskmodule.is_from_pretrained
4650
assert pipeline.model.is_from_pretrained
4751

52+
# create a document with entities
4853
document = ExampleDocument(
4954
"“Making a super tasty alt-chicken wing is only half of it,” said Po Bronson, general partner "
5055
"at SOSV and managing director of IndieBio."
5156
)
57+
document.entities.append(LabeledSpan(start=65, end=75, label="PER"))
58+
document.entities.append(LabeledSpan(start=96, end=100, label="ORG"))
59+
document.entities.append(LabeledSpan(start=126, end=134, label="ORG"))
5260

53-
for start, end, label in [(65, 75, "PER"), (96, 100, "ORG"), (126, 134, "ORG")]:
54-
document.entities.append(LabeledSpan(start=start, end=end, label=label))
61+
# predict relations
62+
with caplog.at_level("WARNING"):
63+
pipeline(document, batch_size=2, half_precision_ops=half_precision_ops)
5564

56-
pipeline(document, batch_size=2, half_precision_ops=half_precision_ops)
57-
relations: Sequence[BinaryRelation] = document["relations"].predictions
58-
assert len(relations) == 3
65+
# sort to get deterministic order
66+
sorted_relations = sorted(document.relations.predictions)
5967

60-
rels = sorted(relations, key=lambda rel: (rel.head.start + rel.tail.start) / 2)
61-
62-
# Note: The scores are quite low, because the model is trained with the old version for the taskmodule,
63-
# so the argument markers are not correct.
64-
assert (str(rels[0].head), rels[0].label, str(rels[0].tail)) == (
65-
"SOSV",
66-
"org:top_members/employees",
67-
"Po Bronson",
68-
)
69-
assert rels[0].score == pytest.approx(0.398, abs=1e-2)
68+
# check the relations and their scores
69+
assert [ann.resolve() for ann in sorted_relations] == [
70+
("per:employee_of", (("PER", "Po Bronson"), ("ORG", "IndieBio"))),
71+
("org:top_members/employees", (("ORG", "SOSV"), ("PER", "Po Bronson"))),
72+
("org:top_members/employees", (("ORG", "IndieBio"), ("PER", "Po Bronson"))),
73+
]
7074

71-
assert (str(rels[1].head), rels[1].label, str(rels[1].tail)) == (
72-
"Po Bronson",
73-
"per:employee_of",
74-
"IndieBio",
75+
half_precision_warning = (
76+
"Using half precision operations with a model already in half precision. "
77+
"This is not recommended, as it may lead to unexpected results."
7578
)
76-
assert rels[1].score == pytest.approx(0.534, abs=1e-2)
7779

78-
assert (str(rels[2].head), rels[2].label, str(rels[2].tail)) == (
79-
"IndieBio",
80-
"org:top_members/employees",
81-
"Po Bronson",
82-
)
83-
assert rels[2].score == pytest.approx(0.552, abs=1e-2)
80+
scores = [rel.score for rel in sorted_relations]
81+
# General note: The scores are quite low, because the model is trained with the old version
82+
# for the taskmodule, so the argument markers are not correct.
83+
# Below scores were obtained with dependencies from poetry.lock on local machine.
84+
if not half_precision_model and not half_precision_ops:
85+
# we use low tolerance if no half precision is used
86+
# (i.e., no autocast on forward pass and model is not cast to half precision)
87+
assert scores == pytest.approx(
88+
[0.5339038372039795, 0.3984701931476593, 0.5520647764205933], abs=1e-6
89+
)
90+
assert half_precision_warning not in caplog.messages
91+
elif not half_precision_model and half_precision_ops:
92+
# set high tolerance for half precision ops (i.e., autocast on forward pass)
93+
assert scores == pytest.approx([0.53125, 0.39453125, 0.5546875], abs=1e-2)
94+
assert half_precision_warning not in caplog.messages
95+
elif half_precision_model and not half_precision_ops:
96+
# set high tolerance for half precision model (i.e., model cast to half precision)
97+
assert scores == pytest.approx([0.53515625, 0.400390625, 0.55859375], abs=1e-2)
98+
assert half_precision_warning not in caplog.messages
99+
else:
100+
# just check that we got the warning about half precision ops in combination with half precision model
101+
assert half_precision_warning in caplog.messages

0 commit comments

Comments
 (0)