
Commit 51171a2

use AutoAnnotationPipeline (#216)
* use pie-core 0.3.1
* switch back to using `AutoAnnotationPipeline` instead of `PyTorchIEPipeline`; add comments to the pipeline configs on adding `pipeline_type: pytorch-ie` for old models
* fix tests by setting `pipeline.pipeline_type: pytorch-ie` in the train.yaml config
* save the pipeline after training (instead of the individual model and taskmodule)
* during inference, raise an exception when trying to load a model checkpoint into a pipeline that is not a PyTorchIEPipeline
1 parent b2aec47 commit 51171a2

File tree: 8 files changed (+35, -23 lines)
Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
-_target_: pytorch_ie.PyTorchIEPipeline.from_pretrained
+_target_: pie_core.AutoAnnotationPipeline.from_pretrained
 pretrained_model_name_or_path: ???
 show_progress_bar: true
+# uncomment for "old" PyTorch-IE models that do not have a pipeline_type key in their config.json
+# pipeline_type: pytorch-ie
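For context, a minimal sketch of how this Hydra config resolves at runtime; the call mirrors the instantiation in src/train.py below, and the model id is an example borrowed from configs/predict.yaml:

import hydra
from omegaconf import OmegaConf

# A sketch, not repo code: build the config above and resolve it the same way
# src/train.py does. The model id is an example from configs/predict.yaml.
cfg = OmegaConf.create(
    {
        "_target_": "pie_core.AutoAnnotationPipeline.from_pretrained",
        "pretrained_model_name_or_path": "pie/example-ner-spanclf-conll03",
        "show_progress_bar": True,
        # only needed for "old" models without a pipeline_type key in config.json:
        # "pipeline_type": "pytorch-ie",
    }
)
pipeline = hydra.utils.instantiate(cfg, _convert_="partial")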

configs/pipeline/ner_re_pipeline.yaml

Lines changed: 4 additions & 0 deletions
@@ -10,5 +10,9 @@ show_progress_bar: true
 device: -1
 ner_pipeline:
   batch_size: 1
+  # uncomment for "old" PyTorch-IE models that do not have a pipeline_type key in their config.json
+  # pipeline_type: pytorch-ie
 re_pipeline:
   batch_size: 1
+  # uncomment for "old" PyTorch-IE models that do not have a pipeline_type key in their config.json
+  # pipeline_type: pytorch-ie
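The ner_pipeline and re_pipeline blocks above become keyword arguments for the hub loader: as the change to src/pipeline/ner_re_pipeline.py below shows, they are forwarded via **self.processor_kwargs.get(...). A hedged sketch of the equivalent direct call, with a placeholder model path:

from pie_core import AutoAnnotationPipeline

# Equivalent direct call for the ner_pipeline entry above; "path/to/ner-model"
# is a placeholder, and pipeline_type is only required for old checkpoints.
ner_pipeline = AutoAnnotationPipeline.from_pretrained(
    "path/to/ner-model",
    batch_size=1,
    pipeline_type="pytorch-ie",
)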

configs/predict.yaml

Lines changed: 4 additions & 0 deletions
@@ -24,6 +24,10 @@ name: "default"
 # or the url to huggingface hub where the taskmodule and model was pushed to.
 # It is used in the pipeline config.
 model_name_or_path: pie/example-ner-spanclf-conll03
+# required for "old" PyTorch-IE models that do not have a pipeline_type key in their config.json.
+# Saving a model with AnnotationPipeline.push_to_hub will automatically add this key to the config.json
+pipeline:
+  pipeline_type: pytorch-ie

 # to override model weights with content of a checkpoint
 ckpt_path: null
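Since newer saves write the pipeline_type key into config.json automatically, one way to tell whether a local checkpoint still needs this override is to inspect its config. A hypothetical helper sketch, not part of the repo:

import json
from pathlib import Path

# Hypothetical helper: the override above is only needed when the checkpoint's
# config.json lacks the pipeline_type key that newer saves add automatically.
def needs_pipeline_type_override(model_dir: str) -> bool:
    config = json.loads((Path(model_dir) / "config.json").read_text())
    return "pipeline_type" not in config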

poetry.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ dependencies = [
   # --------- python-ie --------- #
   "pytorch-ie (>=0.33.0,<0.34.0)",
   "pie-datasets (>=0.11.0,<0.12.0)",
+  "pie-core (>=0.3.1,<0.4.0)", # to use AutoAnnotationPipeline with old models, see https://github.com/ArneBinder/pie-core/pull/95

   # ------- reprocessing -------- #
   "nltk (>=3.9.1,<4.0.0)", # sentence splitter (just for drugprot.yaml experiment which dry-runs in slow tests, remove if not needed)

src/pipeline/ner_re_pipeline.py

Lines changed: 3 additions & 4 deletions
@@ -4,9 +4,8 @@
 from functools import partial
 from typing import Callable, Dict, Iterable, List, Optional, Sequence, Type, TypeVar, Union

+from pie_core import AutoAnnotationPipeline, Document, WithDocumentTypeMixin
 from pie_core.utils.hydra import resolve_type
-from pytorch_ie import PyTorchIEPipeline, WithDocumentTypeMixin
-from pytorch_ie.core import Document

 logger = logging.getLogger(__name__)

@@ -166,7 +165,7 @@ def __call__(self, documents: Sequence[Document], inplace: bool = False) -> Sequ
                 layer_names=[self.entity_layer, self.relation_layer],
                 **self.processor_kwargs.get("clear_annotations", {}),
             ),
-            "ner_pipeline": PyTorchIEPipeline.from_pretrained(
+            "ner_pipeline": AutoAnnotationPipeline.from_pretrained(
                 self.ner_model_path, **self.processor_kwargs.get("ner_pipeline", {})
             ),
             "use_predicted_entities": partial(
@@ -181,7 +180,7 @@ def __call__(self, documents: Sequence[Document], inplace: bool = False) -> Sequ
             # **self.processor_kwargs.get("create_candidate_relations", {})
             # ),
             # ),
-            "re_pipeline": PyTorchIEPipeline.from_pretrained(
+            "re_pipeline": AutoAnnotationPipeline.from_pretrained(
                 self.re_model_path, **self.processor_kwargs.get("re_pipeline", {})
             ),
             # otherwise we can not move the entities back to predictions
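A hedged usage sketch of the two-stage pipeline after this change. The constructor argument names are inferred from the self.* fields in the hunks above and the import path from the file location, so both are assumptions; paths and layer names are placeholders:

from pie_core import Document
from src.pipeline.ner_re_pipeline import NerRePipeline

documents: list[Document] = []  # placeholder: documents to annotate

# Argument names inferred from the fields used above (ner_model_path,
# re_model_path, entity_layer, relation_layer, processor_kwargs); per-stage
# kwargs such as {"ner_pipeline": {"pipeline_type": "pytorch-ie"}} reach
# AutoAnnotationPipeline.from_pretrained via processor_kwargs.
pipeline = NerRePipeline(
    ner_model_path="path/to/ner-model",
    re_model_path="path/to/re-model",
    entity_layer="entities",
    relation_layer="relations",
    processor_kwargs={"ner_pipeline": {"batch_size": 1}},
)
predicted_docs = pipeline(documents, inplace=False)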

src/predict.py

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,7 @@
 from omegaconf import DictConfig, OmegaConf
 from pie_core import AnnotationPipeline
 from pie_datasets import DatasetDict
+from pytorch_ie import PyTorchIEPipeline
 from pytorch_ie.models import *  # noqa: F403
 from pytorch_ie.taskmodules import *  # noqa: F403

@@ -84,6 +85,10 @@ def predict(cfg: DictConfig) -> Tuple[dict, dict]:
     # However, ckpt_path can be used to load different weights from any checkpoint.
     if cfg.ckpt_path is not None:
         log.info(f"Loading model weights from checkpoint: {cfg.ckpt_path}")
+        if not isinstance(pipeline, PyTorchIEPipeline):
+            raise ValueError(
+                "The pipeline has to be of type PyTorchIEPipeline to load a checkpoint."
+            )
         pipeline.model = (
             type(pipeline.model)
             .load_from_checkpoint(checkpoint_path=cfg.ckpt_path)
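The guard exists because only a PyTorchIEPipeline wraps a Lightning model whose class provides load_from_checkpoint; for other AnnotationPipeline types there is no model attribute to override, so the commit raises instead. A condensed standalone sketch of the same logic (hypothetical helper, mirroring the hunk above):

from pie_core import AnnotationPipeline
from pytorch_ie import PyTorchIEPipeline

def override_model_weights(pipeline: AnnotationPipeline, ckpt_path: str) -> None:
    # Refuse checkpoints for pipelines that are not PyTorch-IE pipelines,
    # then rebuild the wrapped Lightning model from the checkpoint.
    if not isinstance(pipeline, PyTorchIEPipeline):
        raise ValueError(
            "The pipeline has to be of type PyTorchIEPipeline to load a checkpoint."
        )
    pipeline.model = type(pipeline.model).load_from_checkpoint(checkpoint_path=ckpt_path)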

src/train.py

Lines changed: 10 additions & 13 deletions
@@ -42,7 +42,7 @@
 from pie_core import AnnotationPipeline, AutoModel, TaskModule
 from pie_core.utils.dictionary import flatten_dict_s
 from pie_datasets import DatasetDict
-from pytorch_ie import PieDataModule, PyTorchIEModel
+from pytorch_ie import PieDataModule, PyTorchIEModel, PyTorchIEPipeline
 from pytorch_ie.models import *  # noqa: F403
 from pytorch_ie.models.interface import (
     RequiresModelNameOrPath,
@@ -197,14 +197,6 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
     log.info("Logging hyperparameters!")
     utils.log_hyperparameters(logger=logger, model=model, taskmodule=taskmodule, config=cfg)

-    if cfg.paths.model_save_dir is not None:
-        log.info(f"Save taskmodule to {cfg.paths.model_save_dir} [push_to_hub={cfg.push_to_hub}]")
-        taskmodule.save_pretrained(
-            save_directory=cfg.paths.model_save_dir, push_to_hub=cfg.push_to_hub
-        )
-    else:
-        log.warning("the taskmodule is not saved because no save_dir is specified")
-
     if cfg.get("train"):
         # Set model in training mode (since pytorch-lightning 2.2.0 the model is not set
         # to train mode automatically in trainer.fit). To just partly train the model
@@ -225,19 +217,25 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
             checkpoint_dir=trainer.checkpoint_callback.dirpath,
         )

+    pipeline: Optional[AnnotationPipeline] = None
     if not cfg.trainer.get("fast_dev_run"):
         if cfg.paths.model_save_dir is not None:
             if best_ckpt_path == "":
                 log.warning("Best ckpt not found! Using current weights for saving...")
             else:
                 model = type(model).load_from_checkpoint(best_ckpt_path)

-            log.info(f"Save model to {cfg.paths.model_save_dir} [push_to_hub={cfg.push_to_hub}]")
-            model.save_pretrained(
+            log.info(
+                f"Save pipeline (model + taskmodule) to {cfg.paths.model_save_dir} [push_to_hub={cfg.push_to_hub}]"
+            )
+            pipeline = PyTorchIEPipeline(model=model, taskmodule=taskmodule)
+            pipeline.save_pretrained(
                 save_directory=cfg.paths.model_save_dir, push_to_hub=cfg.push_to_hub
             )
         else:
-            log.warning("the model is not saved because no save_dir is specified")
+            log.warning(
+                "the pipeline (model + taskmodule) is not saved because no save_dir is specified"
+            )

     if cfg.get("validate"):
         log.info("Starting validation!")
@@ -271,7 +269,6 @@ def train(cfg: DictConfig) -> Tuple[dict, dict]:
     # This can be overridden by the `predict_split` config parameter.
     split = cfg.get("predict_split", datamodule.test_split)
     # Init the inference pipeline
-    pipeline: Optional[AnnotationPipeline] = None
     if cfg.get("pipeline") and cfg.pipeline.get("_target_"):
         log.info(f"Instantiating inference pipeline <{cfg.pipeline._target_}>")
         pipeline = hydra.utils.instantiate(cfg.pipeline, _convert_="partial")
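Taken together, the training-side change makes saving and loading symmetric: one pipeline artifact is written, and AutoAnnotationPipeline can restore it without the manual pipeline_type hint, since the new save path writes that key into config.json. A hedged round-trip sketch; the directory is a placeholder, and model and taskmodule are assumed to come from a training run as above:

from pie_core import AutoAnnotationPipeline
from pytorch_ie import PyTorchIEPipeline

# Save side (as in src/train.py above): bundle model + taskmodule and persist
# them as a single artifact; "models/my-run" is a placeholder directory.
pipeline = PyTorchIEPipeline(model=model, taskmodule=taskmodule)
pipeline.save_pretrained(save_directory="models/my-run", push_to_hub=False)

# Load side (as in configs/pipeline/*.yaml above): no pipeline_type needed,
# because the saved config.json now carries the key.
restored = AutoAnnotationPipeline.from_pretrained("models/my-run")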
