Splitter #719


Merged
merged 39 commits into from
Jun 13, 2025
Commits (39)
00f06d2
first draft
thibaultdvx Apr 16, 2025
58b5ce5
DataLoaderConfig
thibaultdvx Apr 16, 2025
c24dfce
Merge remote-tracking branch 'upstream/clinicadl_v2' into dataloader
thibaultdvx Apr 16, 2025
2d93014
tests
thibaultdvx Apr 17, 2025
60474e1
changes in spit
thibaultdvx Apr 17, 2025
5468b69
sphinx-autodoc-typehints in doc dependencies
thibaultdvx Apr 17, 2025
21889ea
first draft
thibaultdvx Apr 23, 2025
551c1bc
splitter objects
thibaultdvx Apr 23, 2025
8fe5aa2
remove split_utils
thibaultdvx Apr 24, 2025
19b5063
docstrings
thibaultdvx Apr 24, 2025
e1daee9
documentation
thibaultdvx Apr 24, 2025
f5f7864
Merge remote-tracking branch 'upstream/clinicadl_v2' into splitter
thibaultdvx May 19, 2025
0324e3d
add dostring on parameter "dataset"
thibaultdvx May 19, 2025
61b38c1
complete docstring on the argument "dataset"
thibaultdvx May 19, 2025
7033c34
typo in documentation
thibaultdvx May 19, 2025
ca92cd6
first tests
thibaultdvx May 20, 2025
a7840e1
change concat dataframe
thibaultdvx May 21, 2025
02bb399
change paired dataframe
thibaultdvx May 21, 2025
ab39b13
change dataframe for unpaired
thibaultdvx May 21, 2025
2fe4d7a
unittests for make_splits
thibaultdvx Jun 6, 2025
86d2c76
change how to deal with empty datasets
thibaultdvx Jun 11, 2025
344e772
update test data
thibaultdvx Jun 11, 2025
add173e
test
thibaultdvx Jun 11, 2025
9c49665
doc
thibaultdvx Jun 11, 2025
36b6153
try ending multiprocessing tests
thibaultdvx Jun 11, 2025
13d88ca
test multiprocessing ending in test
thibaultdvx Jun 11, 2025
3c23092
Merge branch 'splitter' of https://github.com/thibaultdvx/clinicadl i…
thibaultdvx Jun 11, 2025
aee6086
Revert "test multiprocessing ending in test"
thibaultdvx Jun 11, 2025
e88b43a
try ending multiprocessing in tests
thibaultdvx Jun 11, 2025
ad012cb
fix tests
thibaultdvx Jun 11, 2025
71cf73f
fix tests
thibaultdvx Jun 11, 2025
ee358c1
test if dataloader is the problem
thibaultdvx Jun 11, 2025
3eb397f
test if num_workers is the problem
thibaultdvx Jun 11, 2025
1a9f09b
test if num_workers is the problem
thibaultdvx Jun 11, 2025
25adca4
test conftest
thibaultdvx Jun 11, 2025
8fbc910
try self multiprocessing
thibaultdvx Jun 11, 2025
fc9bd4b
change threshold value for test
thibaultdvx Jun 11, 2025
6e24663
skip some tests for macos
thibaultdvx Jun 11, 2025
52d32e2
minor change
camillebrianceau Jun 13, 2025
47 changes: 26 additions & 21 deletions clinicadl/data/dataloader/config.py
@@ -1,26 +1,22 @@
from typing import Iterator, Optional, Union, overload
from typing import Iterator, Optional, overload

from pydantic import NonNegativeInt, PositiveInt, model_validator
from torch.utils.data import DataLoader as TorchDataLoader
from torch.utils.data import DistributedSampler, Sampler, WeightedRandomSampler

from clinicadl.data.datasets import (
CapsDataset,
ConcatDataset,
PairedDataset,
UnpairedDataset,
)
from clinicadl.data.datasets.types import Dataset, SimpleDataset, TupleDataset
from clinicadl.utils.config import ClinicaDLConfig
from clinicadl.utils.seed import pl_worker_init_function

from .batch import SimpleBatch, simple_collate_fn, tuple_collate_fn

SimpleDataset = Union[CapsDataset, ConcatDataset]
TupleDataset = Union[PairedDataset, UnpairedDataset]
Dataset = Union[SimpleDataset, TupleDataset]


class DataLoader(TorchDataLoader):
"""
Overwrites :py:class:`torch.utils.data.DataLoader` only to add a `set_epoch` method.
"""
@@ -294,6 +290,8 @@ def get_object(
------
ValueError
If only one of ``dp_degree`` and ``rank`` is not ``None``.
ValueError
If ``rank`` is greater than or equal to ``dp_degree``.
ValueError
If the dataset is an :py:class:`~clinicadl.data.datasets.UnpairedDataset`,
and ``sampling_weights`` is not ``None``.
@@ -304,6 +302,24 @@ def get_object(
If ``sampling_weights`` is not ``None`` and the associated column cannot
be converted to float values.
"""
if (rank is not None and dp_degree is None) or (
dp_degree is not None and rank is None
):
raise ValueError(
"For data parallelism, none of 'dp_degree' and 'rank' can be None. "
f"Got rank={rank} and dp_degree={dp_degree}"
)

if dp_degree is None:
dp_degree = 1
rank = 0

if rank >= dp_degree:
raise ValueError(
"'rank' must be strictly smaller than 'dp_degree'. Got "
f"dp_degree={dp_degree} and rank={rank}"
)

return DataLoader(
dataset=dataset,
sampler=self._generate_sampler(dataset, dp_degree, rank),
@@ -317,26 +333,15 @@ def get_object(
def _generate_sampler(
self,
dataset: CapsDataset,
dp_degree: Optional[int],
rank: Optional[int],
dp_degree: int,
rank: int,
) -> Sampler:
"""
Returns a WeightedRandomSampler if self.sampling_weights is not None, otherwise
a DistributedSampler, even when data parallelism is not performed (in this case
the degree of data parallelism is set to 1, so it is equivalent to a simple PyTorch
RandomSampler if self.shuffle is True or no sampler if self.shuffle is False).
"""
if (rank is not None and dp_degree is None) or (
dp_degree is not None and rank is None
):
raise ValueError(
"For data parallelism, none of 'dp_degree' and 'rank' can be None. "
f"Got rank={rank} and dp_degree={dp_degree}"
)
if dp_degree is None:
dp_degree = 1
rank = 0

if self.sampling_weights and rank is not None:
weights = self._get_weights(dataset, self.sampling_weights)
length = len(weights) // dp_degree + int(rank < len(weights) % dp_degree)
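For readers skimming the diff, here is a minimal, hypothetical sketch (plain PyTorch, outside ClinicaDL) of the validation and sampler selection this file now performs. The `make_sampler` helper, its signature, and the toy arguments are illustrative assumptions, not the PR's actual code:

```python
from typing import Optional, Sequence

from torch.utils.data import Dataset, DistributedSampler, Sampler, WeightedRandomSampler


def make_sampler(
    dataset: Dataset,
    shuffle: bool,
    weights: Optional[Sequence[float]] = None,
    dp_degree: Optional[int] = None,
    rank: Optional[int] = None,
) -> Sampler:
    """Hypothetical helper mirroring the checks moved into get_object."""
    # 'dp_degree' and 'rank' must be either both set or both None.
    if (dp_degree is None) != (rank is None):
        raise ValueError(f"Got rank={rank} and dp_degree={dp_degree}")
    if dp_degree is None:
        dp_degree, rank = 1, 0  # no data parallelism: a single "replica"
    if rank >= dp_degree:
        raise ValueError(
            "'rank' must be strictly smaller than 'dp_degree'. "
            f"Got dp_degree={dp_degree} and rank={rank}"
        )

    if weights is not None:
        # Share the draws across ranks: the first len(weights) % dp_degree
        # ranks each get one extra sample (same formula as in the diff above).
        n_samples = len(weights) // dp_degree + int(rank < len(weights) % dp_degree)
        return WeightedRandomSampler(weights, num_samples=n_samples)

    # With dp_degree=1 and rank=0 this degenerates to a plain RandomSampler
    # (shuffle=True) or sequential iteration (shuffle=False).
    return DistributedSampler(dataset, num_replicas=dp_degree, rank=rank, shuffle=shuffle)
```

This degeneracy at `dp_degree=1` is presumably why the PR can return a `DistributedSampler` unconditionally, even when no data parallelism is requested.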
61 changes: 19 additions & 42 deletions clinicadl/data/datasets/caps_dataset.py
@@ -26,14 +26,10 @@
)
from clinicadl.transforms.extraction import ExtractionMethod, Sample
from clinicadl.transforms.transforms import Transforms
from clinicadl.tsvtools.utils import (
check_df,
tsv_to_df,
)
from clinicadl.tsvtools.utils import read_data
from clinicadl.utils.exceptions import (
ClinicaDLArgumentError,
ClinicaDLCAPSError,
ClinicaDLTSVError,
)
from clinicadl.utils.typing import DataType, PathType

@@ -332,7 +328,10 @@ def to_tensors(
self._count_samples()

def read_tensor_conversion(
self, json_name: str, check_transforms: bool = True, load_also: list[str] = []
self,
json_name: str,
check_transforms: bool = True,
load_also: Optional[list[str]] = None,
) -> None:
"""
To read an old tensor conversion. The function will check that
@@ -365,7 +364,7 @@ def read_tensor_conversion(
.. warning::
**To use carefully**. You must be sure that the transforms match before setting ``check_transforms=False``.

load_also : list[str] (optional, default=[])
load_also : list[str] (optional, default=None)
To load additional information potentially stored in ``.pt`` files. By default, only the image, the label, and masks
mentioned in the argument ``masks`` of the CapsDataset will be loaded.

@@ -421,28 +420,20 @@ def subset(self, data: DataType) -> CapsDataset:
ClinicaDLTSVError
If the DataFrame associated to ``data`` does not contain the columns ``"participant_id"``
and ``"session_id"``.
ClinicaDLTSVError
If some (participant, session) pairs mentioned in ``data`` are not in the current CapsDataset.
ClinicaDLCAPSError
If no (participant, session) pairs mentioned in ``data`` are in the current CapsDataset
(this would lead to an empty dataset).
"""
new_df = self._check_data_instance(data).set_index([PARTICIPANT_ID, SESSION_ID])

try:
subset_df = (
self.df.set_index([PARTICIPANT_ID, SESSION_ID])
.loc[new_df.index]
.reset_index()
)
except KeyError as exc:
missing_pairs = new_df.index.difference(
self.df.set_index([PARTICIPANT_ID, SESSION_ID]).index
)
new_df = read_data(data, check_protected_names=False).set_index(
[PARTICIPANT_ID, SESSION_ID]
)
df = self.df.set_index([PARTICIPANT_ID, SESSION_ID])
subset_df = df.loc[new_df.index.intersection(df.index)].reset_index()

err_message = (
"Some couples (participant, session) are not in the dataset:\n"
if len(subset_df) == 0:
raise ClinicaDLCAPSError(
"No (participant, session) pairs mentioned in 'data' are in the CapsDataset. This would lead to an empty dataset!"
)
for pair in missing_pairs:
err_message += f" - {pair} \n"
raise ClinicaDLTSVError(err_message) from exc

dataset = deepcopy(self)
dataset.df = subset_df
@@ -621,7 +612,7 @@ def _check_label(self, label: Optional[str]) -> Optional[Union[Column, Mask]]:
"""
if isinstance(label, str):
if label in self.df.columns:
if isinstance(self.df[label].iloc[0], str):
if not pd.api.types.is_numeric_dtype(self.df[label]):
label_list = self.df[label].unique()
if len(label_list) > 5:
raise ClinicaDLArgumentError(
@@ -727,24 +718,10 @@ def _get_df_from_input(self, data: Optional[DataType]) -> pd.DataFrame:
f"'data' must be a Pandas DataFrame, a path to a TSV file or None. Got {data}"
)

df = self._check_data_instance(data)
df = read_data(data)

return deepcopy(df)

@staticmethod
def _check_data_instance(data: DataType) -> pd.DataFrame:
"""
Checks the DataFrame passed by the user (either as a DataFrame or
as a path to a TSV). Returns the checked DataFrame.
"""
if isinstance(data, (str, Path)):
path = Path(data)
df = tsv_to_df(path)
elif isinstance(data, pd.DataFrame):
df = check_df(data)

return df # pylint: disable=possibly-used-before-assignment

### for __getitem__ ###
def _get_meta_data(self, idx: int) -> Tuple[str, str, int]:
"""
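As a hedged illustration of the new `subset` behavior in this file — unknown (participant, session) pairs are now silently dropped, and an error is raised only when the intersection is empty — here is a self-contained pandas sketch. The toy IDs and the plain `ValueError` (standing in for `ClinicaDLCAPSError`) are assumptions:

```python
import pandas as pd

# Toy stand-ins for CapsDataset.df and the user's 'data' (hypothetical IDs).
df = pd.DataFrame(
    {"participant_id": ["sub-01", "sub-02"], "session_id": ["ses-M000", "ses-M000"]}
)
data = pd.DataFrame(
    {"participant_id": ["sub-02", "sub-99"], "session_id": ["ses-M000", "ses-M000"]}
)

indexed = df.set_index(["participant_id", "session_id"])
wanted = data.set_index(["participant_id", "session_id"]).index

# Unknown pairs (sub-99 here) are silently dropped by the intersection...
subset_df = indexed.loc[wanted.intersection(indexed.index)].reset_index()

# ...and an error is raised only if nothing at all matched.
if len(subset_df) == 0:
    raise ValueError("No (participant, session) pairs mentioned in 'data' are in the dataset.")

print(subset_df)  # a single row: (sub-02, ses-M000)
```

This is what lets `ConcatDataset.subset` (next file) pass the full `data` to every underlying dataset and simply skip the ones that come back empty.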
58 changes: 23 additions & 35 deletions clinicadl/data/datasets/concat.py
@@ -10,10 +10,10 @@
import pandas as pd
from torch.utils.data import ConcatDataset as TorchConcatDataset

from clinicadl.dictionary.words import N_SAMPLES, PARTICIPANT_ID, SESSION_ID
from clinicadl.dictionary.words import DATASET_ID, PARTICIPANT_ID, SESSION_ID
from clinicadl.transforms.extraction import Sample
from clinicadl.transforms.extraction.slice import Slice
from clinicadl.utils.exceptions import ClinicaDLCAPSError, ClinicaDLTSVError
from clinicadl.utils.exceptions import ClinicaDLCAPSError
from clinicadl.utils.typing import DataType

from .caps_dataset import CapsDataset
@@ -163,38 +163,28 @@ def subset(self, data: DataType) -> ConcatDataset:
If the DataFrame associated to ``data`` does not contain the columns ``"participant_id"``
and ``"session_id"``.
ClinicaDLCAPSError
If some (participant, session) pairs mentioned in ``data`` are not in any of the CapsDatasets
forming the ConcatDataset.
If no (participant, session) pairs mentioned in ``data`` are in at least one of the underlying datasets.
This would lead to an empty ConcatDataset.
"""
data = CapsDataset._check_data_instance(data).set_index(
[PARTICIPANT_ID, SESSION_ID]
)

in_a_df = {(participant, session): False for participant, session in data.index}
datasets = []
sub_datasets = []
not_empty = False
for dataset in self.datasets:
participants_sessions = dataset.get_participant_session_couples()
participants_sessions = data.index.intersection(participants_sessions)

for participant_session in participants_sessions:
in_a_df[participant_session] = True

sub_data = data.loc[participants_sessions]
try:
datasets.append(dataset.subset(sub_data.reset_index()))
except ClinicaDLTSVError:
sub_datasets.append(dataset.subset(data))
not_empty = True
except ClinicaDLCAPSError: # empty dataset
continue

raise_error = False
err_message = "Some couples (participant, session) are not in any of the datasets forming the ConcatDataset:\n"
for participant_session in in_a_df:
if not in_a_df[participant_session]:
raise_error = True
err_message += f" - {participant_session} \n"
if raise_error:
raise ClinicaDLCAPSError(err_message)
if not not_empty:
raise ClinicaDLCAPSError(
"No (participant, session) pairs mentioned in 'data' are in the ConcatDataset. This would lead to an empty dataset!"
)

return ConcatDataset(datasets, ignore_spacing=True, raise_warnings=False)
return ConcatDataset(
sub_datasets,
ignore_spacing=True,
raise_warnings=False,
)

def describe(self) -> tuple[Dict[str, Any], ...]:
"""
Expand Down Expand Up @@ -357,16 +347,14 @@ def _concat_dfs(self) -> pd.DataFrame:
"""
Concatenates the dataframes from all the datasets.
"""
df = pd.concat(
[
dataset.df[[PARTICIPANT_ID, SESSION_ID, N_SAMPLES]]
for dataset in self.datasets
],
df: pd.DataFrame = pd.concat(
[dataset.df for dataset in self.datasets],
keys=range(len(self.datasets)),
names=["dataset_id"],
names=[DATASET_ID],
)
CapsDataset._map_indices_to_images(df)

return df.reset_index(
drop=False,
level=0,
level=DATASET_ID,
).reset_index(drop=True)
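A short pandas sketch of what the rewritten `_concat_dfs` does with `keys`/`names`; the toy frames are hypothetical, and `DATASET_ID` is assumed to resolve to the string "dataset_id":

```python
import pandas as pd

# Hypothetical per-dataset frames, standing in for each CapsDataset.df.
df_a = pd.DataFrame({"participant_id": ["sub-01"], "session_id": ["ses-M000"]})
df_b = pd.DataFrame({"participant_id": ["sub-02"], "session_id": ["ses-M000"]})

# 'keys' prepends an index level identifying the source dataset, and 'names'
# labels that level ("dataset_id" is assumed to be the value of DATASET_ID).
stacked = pd.concat([df_a, df_b], keys=range(2), names=["dataset_id"])

# Turn that level back into a regular column, then drop the leftover row index.
result = stacked.reset_index(drop=False, level="dataset_id").reset_index(drop=True)
print(result)
#    dataset_id participant_id session_id
# 0           0         sub-01   ses-M000
# 1           1         sub-02   ses-M000
```

Keeping the full per-dataset frames (rather than only the participant/session/sample columns, as before) is what allows the subsequent `CapsDataset._map_indices_to_images(df)` call to operate on the concatenated result.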