Add --no-clean option to skip automatic dataset cleaning

dtronmans · dtronmans · commit 8ce757cfac09 · 2026-01-13T13:57:16.000+01:00
diff --git a/luxonis_ml/data/__main__.py b/luxonis_ml/data/__main__.py
@@ -575,6 +575,15 @@ def parse(
             show_default=False,
         ),
     ] = None,
+    no_clean: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--no-clean",
+            help="Skip automatic cleaning of known dataset issues "
+            "(e.g., duplicate images in COCO, duplicate class names in ImageNet).",
+        ),
+    ] = False,
 ):
     """Parses a directory with data and creates Luxonis dataset."""
     parser = LuxonisParser(
@@ -584,6 +593,7 @@ def parse(
         delete_local=delete_local,
         save_dir=save_dir,
         task_name=task_name,
+        skip_clean=no_clean,
     )
     dataset = parser.parse()
 
diff --git a/luxonis_ml/data/parsers/coco_parser.py b/luxonis_ml/data/parsers/coco_parser.py
@@ -127,6 +127,7 @@ def from_dir(
         use_keypoint_ann: bool = False,
         keypoint_ann_paths: dict[str, str] | None = None,
         split_val_to_test: bool = True,
+        skip_clean: bool = False,
     ) -> tuple[list[Path], list[Path], list[Path]]:
         dir_format, splits = COCOParser._detect_dataset_dir_format(dataset_dir)
         if dir_format is None:
@@ -160,7 +161,9 @@ def from_dir(
             and dir_format is COCOFormat.FIFTYONE
             else train_paths["annotation_path"]
         )
-        cleaned_annotation_path = clean_annotations(train_ann_path)
+        cleaned_annotation_path = (
+            train_ann_path if skip_clean else clean_annotations(train_ann_path)
+        )
         added_train_imgs = self._parse_split(
             image_dir=train_paths["image_dir"],
             annotation_path=cleaned_annotation_path,
diff --git a/luxonis_ml/data/parsers/fiftyone_classification_parser.py b/luxonis_ml/data/parsers/fiftyone_classification_parser.py
@@ -114,7 +114,9 @@ def _extract_native_classes(labels_path: Path) -> dict[int, str]:
         classes = labels_data.get("classes", [])
         return dict(enumerate(classes))
 
-    def from_split(self, split_path: Path) -> ParserOutput:
+    def from_split(
+        self, split_path: Path, skip_clean: bool = False
+    ) -> ParserOutput:
         labels_path = split_path / "labels.json"
         data_path = split_path / "data"
 
@@ -123,7 +125,8 @@ def from_split(self, split_path: Path) -> ParserOutput:
         # and label indices, and set native classes
         is_flat_structure = split_path.name not in self.SPLIT_NAMES
         if is_flat_structure:
-            labels_path = clean_imagenet_annotations(labels_path)
+            if not skip_clean:
+                labels_path = clean_imagenet_annotations(labels_path)
             native_classes = self._extract_native_classes(labels_path)
             if native_classes:
                 self.dataset.set_native_classes(native_classes, "imagenet")
diff --git a/luxonis_ml/data/parsers/luxonis_parser.py b/luxonis_ml/data/parsers/luxonis_parser.py
@@ -65,6 +65,7 @@ def __init__(
         dataset_plugin: T = None,
         dataset_type: DatasetType | None = None,
         task_name: str | dict[str, str] | None = None,
+        skip_clean: bool = False,
         **kwargs,
     ):
         """High-level abstraction over various parsers.
@@ -104,6 +105,10 @@ def __init__(
             a dictionary with class names as keys and task names as values.
             In the latter case, the task name for a record with a given
             class name will be taken from the dictionary.
+        @type skip_clean: bool
+        @param skip_clean: If C{True}, skip automatic cleaning of known
+            dataset issues (e.g., duplicate images in COCO, duplicate
+            class names in ImageNet). Defaults to C{False}.
         @type kwargs: Dict[str, Any]
         @param kwargs: Additional C{kwargs} to be passed to the
             constructor of specific L{BaseDataset} implementation.
@@ -155,6 +160,7 @@ class name will be taken from the dictionary.
         self.parser = self.parsers[self.dataset_type](
             self.dataset, self.dataset_type, task_name
         )
+        self.skip_clean = skip_clean
 
     @overload
     def parse(self: "LuxonisParser[str]", **kwargs) -> BaseDataset: ...
@@ -216,7 +222,9 @@ def _parse_dir(self, **kwargs) -> BaseDataset:
         @return: C{LDF} with all the images and annotations parsed.
         """
 
-        return self.parser.parse_dir(self.dataset_dir, **kwargs)
+        return self.parser.parse_dir(
+            self.dataset_dir, skip_clean=self.skip_clean, **kwargs
+        )
 
     def _parse_split(
         self,
@@ -252,7 +260,12 @@ def _parse_split(
                 f"Dataset {self.dataset_dir} is not in the expected format for {self.dataset_type} parser."
             )
         return self.parser.parse_split(
-            split, random_split, split_ratios, **parsed_kwargs, **kwargs
+            split,
+            random_split,
+            split_ratios,
+            skip_clean=self.skip_clean,
+            **parsed_kwargs,
+            **kwargs,
         )
 
     def _download_roboflow_dataset(