-"""Dataset class where a sample corresponds to multiple embeddings."""
-
-import os
-from typing import Callable, Dict, List, Literal
+"""Dataset class where a classification task sample corresponds to multiple embeddings."""

 import numpy as np
-import torch
-from typing_extensions import override

-from eva.core.data.datasets import embeddings as embeddings_base
+from eva.core.data.datasets.multi_embeddings import MultiEmbeddingsDataset


-class MultiEmbeddingsClassificationDataset(embeddings_base.EmbeddingsDataset[torch.Tensor]):
+class MultiEmbeddingsClassificationDataset(MultiEmbeddingsDataset):
     """Dataset class where a sample corresponds to multiple embeddings.

-    Example use case: a slide-level dataset where each slide has multiple patch embeddings.
+    Specialised for classification data with an int target type.
     """

-    def __init__(
-        self,
-        root: str,
-        manifest_file: str,
-        split: Literal["train", "val", "test"],
-        column_mapping: Dict[str, str] = embeddings_base.default_column_mapping,
-        embeddings_transforms: Callable | None = None,
-        target_transforms: Callable | None = None,
-    ):
-        """Initialize dataset.
-
-        Expects a manifest file listing the paths of `.pt` files containing tensor embeddings.
-
-        The manifest must have a `column_mapping["multi_id"]` column that contains the
-        unique identifier of each group of embeddings. For oncology datasets, this would
-        usually be the slide id. Each row in the manifest file points to a `.pt` file that
-        can contain one or multiple embeddings (either as a list or as stacked tensors).
-        There can also be multiple rows for the same `multi_id`, in which case the embeddings
-        from the different `.pt` files corresponding to that same `multi_id` will be stacked
-        along the first dimension.
-
-        Args:
-            root: Root directory of the dataset.
-            manifest_file: The path to the manifest file, which is relative to
-                the `root` argument.
-            split: The dataset split to use. The manifest is filtered on its
-                `split` column based on this value.
-            column_mapping: Defines the map between the variables and the manifest
-                columns. It will overwrite the `default_column_mapping` with
-                the provided values, so that `column_mapping` can contain only the
-                values which are altered or missing.
-            embeddings_transforms: A function/transform that transforms the embedding.
-            target_transforms: A function/transform that transforms the target.
-        """
-        super().__init__(
-            manifest_file=manifest_file,
-            root=root,
-            split=split,
-            column_mapping=column_mapping,
-            embeddings_transforms=embeddings_transforms,
-            target_transforms=target_transforms,
-        )
-
-        self._multi_ids: List[int]
-
-    @override
-    def setup(self):
-        super().setup()
-        self._multi_ids = list(self._data[self._column_mapping["multi_id"]].unique())
-
-    @override
-    def load_embeddings(self, index: int) -> torch.Tensor:
-        """Loads and stacks all embeddings corresponding to the `index`'th multi_id."""
-        # Get the paths of all embeddings for the given index (multi_id)
-        multi_id = self._multi_ids[index]
-        embedding_paths = self._data.loc[
-            self._data[self._column_mapping["multi_id"]] == multi_id, self._column_mapping["path"]
-        ].to_list()
-
-        # Load the embeddings and stack them along the first dimension
-        embeddings = []
-        for path in embedding_paths:
-            embedding = torch.load(os.path.join(self._root, path), map_location="cpu")
-            if isinstance(embedding, list):
-                embedding = torch.stack(embedding, dim=0)
-            embeddings.append(embedding.unsqueeze(0) if embedding.ndim == 1 else embedding)
-        embeddings = torch.cat(embeddings, dim=0)
-
-        if embeddings.ndim != 2:
-            raise ValueError(f"Expected a 2D tensor, got a {embeddings.ndim}D tensor for {multi_id}.")
-
-        return embeddings
-
-    @override
-    def load_target(self, index: int) -> np.ndarray:
-        """Returns the target corresponding to the `index`'th multi_id.
-
-        This method assumes that all the embeddings corresponding to the same `multi_id`
-        have the same target. If this is not the case, it will raise an error.
-        """
-        multi_id = self._multi_ids[index]
-        targets = self._data.loc[
-            self._data[self._column_mapping["multi_id"]] == multi_id, self._column_mapping["target"]
-        ]
-
-        if targets.nunique() != 1:
-            raise ValueError(f"Multiple targets found for {multi_id}.")
-
-        return np.asarray(targets.iloc[0], dtype=np.int64)
-
-    @override
-    def __len__(self) -> int:
-        return len(self._multi_ids)
+    def __init__(self, *args, **kwargs):
+        """Initialize the dataset with an integer target type."""
+        super().__init__(*args, target_type=np.int64, **kwargs)
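
For context, the removed docstring above describes the manifest layout: several rows may share one `multi_id` (e.g. a slide id), each pointing to a `.pt` file whose embeddings get stacked along the first dimension. A minimal sketch of such a manifest, assuming the default `column_mapping` uses the column names `path`, `multi_id`, `target`, and `split` (the concrete file names and labels below are invented for illustration):

import pandas as pd

# Hypothetical manifest: the two slide_1 rows would be loaded and concatenated
# into a single (n_embeddings, embedding_dim) tensor for that sample.
manifest = pd.DataFrame(
    {
        "path": ["slide_1/patches_0.pt", "slide_1/patches_1.pt", "slide_2/patches_0.pt"],
        "multi_id": ["slide_1", "slide_1", "slide_2"],
        "target": [0, 0, 1],  # rows sharing a multi_id must agree on the target
        "split": ["train", "train", "train"],
    }
)
print(manifest.groupby("multi_id")["path"].apply(list))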
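
After the refactor, the subclass only pins the target dtype and forwards everything else to `MultiEmbeddingsDataset`. Assuming the base class kept the constructor parameters of the removed `__init__` (`root`, `manifest_file`, `split`, and the optional transforms, which the `*args, **kwargs` forwarding suggests but the diff does not show), usage would look like:

# Hypothetical usage; the paths and file names are placeholders.
dataset = MultiEmbeddingsClassificationDataset(
    root="/data/embeddings",
    manifest_file="manifest.csv",
    split="train",
)
dataset.setup()
embeddings = dataset.load_embeddings(0)  # stacked 2D tensor for one multi_id
target = dataset.load_target(0)          # numpy array with dtype np.int64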