10 changes: 10 additions & 0 deletions luxonis_ml/data/__main__.py
@@ -575,6 +575,15 @@ def parse(
show_default=False,
),
] = None,
no_clean: Annotated[
bool,
typer.Option(
...,
"--no-clean",
help="Skip automatic cleaning of known dataset issues "
"(e.g., duplicate images in COCO, duplicate class names in ImageNet).",
),
] = False,
):
"""Parses a directory with data and creates Luxonis dataset."""
parser = LuxonisParser(
@@ -584,6 +593,7 @@
delete_local=delete_local,
save_dir=save_dir,
task_name=task_name,
skip_clean=no_clean,
)
dataset = parser.parse()

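For context, a minimal standalone sketch of how a Typer flag like this behaves; this is an illustration, not the real CLI (names and help text abbreviated). Omitting the flag yields no_clean=False; passing --no-clean yields True, which the command forwards to LuxonisParser as skip_clean.

import typer
from typing import Annotated

app = typer.Typer()

@app.command()
def parse(
    no_clean: Annotated[
        bool, typer.Option("--no-clean", help="Skip automatic cleaning.")
    ] = False,
):
    # In the real command this value is passed on as skip_clean=no_clean.
    print(f"skip_clean={no_clean}")

if __name__ == "__main__":
    app()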
21 changes: 21 additions & 0 deletions luxonis_ml/data/datasets/base_dataset.py
@@ -109,6 +109,27 @@ def get_classes(self) -> dict[str, list[str]]:
"""
...

@abstractmethod
def set_native_classes(
self,
native_classes: dict[int, str],
source_format: str,
) -> None:
"""Sets the native class indexing from the original dataset
format.

This stores the mapping from original dataset indices to class
names.

@type native_classes: Dict[int, str]
@param native_classes: A dictionary mapping original dataset
indices to class names.
@type source_format: str
@param source_format: The name of the source format (e.g.,
"coco", "imagenet").
"""
...

@abstractmethod
def set_skeletons(
self,
22 changes: 22 additions & 0 deletions luxonis_ml/data/datasets/luxonis_dataset.py
@@ -848,6 +848,28 @@ def get_metadata_types(
) -> dict[str, Literal["float", "int", "str", "Category"]]:
return self._metadata.metadata_types

def set_native_classes(
self,
native_classes: dict[int, str],
source_format: str,
) -> None:
"""Sets the native class indexing from the original dataset
format.

This stores the mapping from original dataset indices to class
names.
"""
self._metadata.native_classes[source_format] = native_classes
self._write_metadata()

def get_native_classes(
self,
source_format: str | None = None,
) -> dict[str, dict[int, str]] | dict[int, str] | None:
if source_format is None:
return self._metadata.native_classes
return self._metadata.native_classes.get(source_format)

def pull_from_cloud(
self, update_mode: UpdateMode = UpdateMode.MISSING
) -> None:
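A short usage sketch of the new accessor pair; the dataset variable and the stored values are hypothetical, assuming an existing LuxonisDataset instance:

# Store the original COCO indexing alongside the dataset's own classes.
dataset.set_native_classes({0: "person", 1: "bicycle", 2: "car"}, "coco")

# Query one source format, or all stored mappings at once.
dataset.get_native_classes("coco")      # {0: "person", 1: "bicycle", 2: "car"}
dataset.get_native_classes()            # {"coco": {0: "person", ...}}
dataset.get_native_classes("imagenet")  # None: this format was never stored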
4 changes: 4 additions & 0 deletions luxonis_ml/data/datasets/metadata.py
@@ -23,6 +23,7 @@ class Metadata(BaseModelExtraForbid):
categorical_encodings: dict[str, dict[str, int]] = {}
metadata_types: dict[str, Literal["float", "int", "str", "Category"]] = {}
parent_dataset: str | None = None
native_classes: dict[str, dict[int, str]] = {}

def set_classes(
self, classes: list[str] | dict[str, int], task: str
@@ -99,6 +100,8 @@ def merge_with(self, other: "Metadata") -> "Metadata":
else:
merged_source = None

merged_native_classes = {**self.native_classes, **other.native_classes}

return Metadata(
ldf_version=self.ldf_version,
source=merged_source,
@@ -107,6 +110,7 @@
skeletons=merged_skeletons,
categorical_encodings=merged_categorical_encodings,
metadata_types=merged_metadata_types, # type: ignore
native_classes=merged_native_classes,
)

def _sort_classes(self, classes: Iterable[str]) -> list[str]:
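One behavioral note on the merge above: {**self.native_classes, **other.native_classes} is a shallow dict union, so when both sides define the same source format, the mapping from other replaces the one from self wholesale; per-index entries are not combined. A quick illustration:

self_nc = {"coco": {0: "person"}}
other_nc = {"coco": {0: "cat", 1: "dog"}, "imagenet": {0: "tench"}}
merged = {**self_nc, **other_nc}
# merged == {"coco": {0: "cat", 1: "dog"}, "imagenet": {0: "tench"}}
# self's {"coco": {0: "person"}} entry is discarded entirely.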
2 changes: 2 additions & 0 deletions luxonis_ml/data/parsers/base_parser.py
@@ -151,6 +151,7 @@ def parse_split(
@rtype: LuxonisDataset
@return: C{LDF} with all the images and annotations parsed.
"""
self.skip_clean = kwargs.pop("skip_clean", False)
added_images = self._parse_split(**kwargs)
if split is not None:
self.dataset.make_splits({split: added_images})
@@ -169,6 +170,7 @@ def parse_dir(self, dataset_dir: Path, **kwargs) -> BaseDataset:
@rtype: LuxonisDataset
@return: C{LDF} with all the images and annotations parsed.
"""
self.skip_clean = kwargs.pop("skip_clean", False)
train, val, test = self.from_dir(dataset_dir, **kwargs)

self.dataset.make_splits({"train": train, "val": val, "test": test})
30 changes: 28 additions & 2 deletions luxonis_ml/data/parsers/coco_parser.py
@@ -160,7 +160,11 @@ def from_dir(
and dir_format is COCOFormat.FIFTYONE
else train_paths["annotation_path"]
)
- cleaned_annotation_path = clean_annotations(train_ann_path)
+ cleaned_annotation_path = (
+     train_ann_path
+     if self.skip_clean
+     else clean_annotations(train_ann_path)
+ )
added_train_imgs = self._parse_split(
image_dir=train_paths["image_dir"],
annotation_path=cleaned_annotation_path,
@@ -211,10 +215,27 @@
annotation_path=test_ann_path,
)

# Extract and set native COCO class indexing
native_classes = self._extract_native_classes(cleaned_annotation_path)
if native_classes:
self.dataset.set_native_classes(native_classes, "coco")

return added_train_imgs, added_val_imgs, added_test_imgs

@staticmethod
def _extract_native_classes(annotation_path: Path) -> dict[int, str]:
with open(annotation_path) as f:
annotation_data = json.load(f)

categories = annotation_data.get("categories", [])
# Build a contiguous, 0-indexed mapping sorted by original category id
sorted_categories = sorted(categories, key=lambda x: x["id"])
return {i: cat["name"] for i, cat in enumerate(sorted_categories)}

def from_split(
- self, image_dir: Path, annotation_path: Path
+ self,
+ image_dir: Path,
+ annotation_path: Path,
) -> ParserOutput:
"""Parses annotations from COCO format to LDF. Annotations
include classification, segmentation, object detection and
@@ -228,6 +249,11 @@ def from_split(
@return: Annotation generator, list of classes names, skeleton
dictionary for keypoints and list of added images.
"""
annotation_path = (
annotation_path
if self.skip_clean
else clean_annotations(annotation_path)
)

with open(annotation_path) as f:
annotation_data = json.load(f)
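To make the re-indexing in _extract_native_classes concrete: COCO category ids are typically 1-based and may be sparse, and the helper compacts them into a contiguous 0-based mapping. The categories below are fabricated for illustration:

categories = [
    {"id": 18, "name": "dog"},
    {"id": 1, "name": "person"},
    {"id": 3, "name": "car"},
]
sorted_categories = sorted(categories, key=lambda x: x["id"])
native = {i: cat["name"] for i, cat in enumerate(sorted_categories)}
# native == {0: "person", 1: "car", 2: "dog"}
# Note that the original ids (1, 3, 18) are not preserved in the keys.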
102 changes: 102 additions & 0 deletions luxonis_ml/data/parsers/fiftyone_classification_parser.py
@@ -2,6 +2,8 @@
from pathlib import Path
from typing import Any

from loguru import logger

from luxonis_ml.data import DatasetIterator

from .base_parser import BaseParser, ParserOutput
@@ -96,12 +98,37 @@ def from_dir(
split_path=dataset_dir / "test"
)

train_labels_path = dataset_dir / "train" / "labels.json"
if train_labels_path.exists():
native_classes = self._extract_native_classes(train_labels_path)
if native_classes:
self.dataset.set_native_classes(native_classes, "imagenet")

return added_train_imgs, added_val_imgs, added_test_imgs

@staticmethod
def _extract_native_classes(labels_path: Path) -> dict[int, str]:
with open(labels_path) as f:
labels_data = json.load(f)

classes = labels_data.get("classes", [])
return dict(enumerate(classes))

def from_split(self, split_path: Path) -> ParserOutput:
labels_path = split_path / "labels.json"
data_path = split_path / "data"

# For a flat structure (not a standard split directory), clean
# ImageNet annotations to fix known issues with class names
# and label indices, and set native classes
is_flat_structure = split_path.name not in self.SPLIT_NAMES
if is_flat_structure:
if not self.skip_clean:
labels_path = clean_imagenet_annotations(labels_path)
native_classes = self._extract_native_classes(labels_path)
if native_classes:
self.dataset.set_native_classes(native_classes, "imagenet")

with open(labels_path) as f:
labels_data = json.load(f)

@@ -127,3 +154,78 @@ def generator() -> DatasetIterator:
added_images = self._get_added_images(generator())

return generator(), {}, added_images


def clean_imagenet_annotations(labels_path: Path) -> Path:
"""Cleans ImageNet annotations by fixing known issues with class
names and label indices.

This function handles two known issues in ImageNet FiftyOne exports:

1. Duplicate class names: First instance of "crane" is renamed
to "crane_bird", second instance of "maillot" is renamed to
"maillot_swim_suit".

2. Misindexed labels: "006742" label 517 is corrected to 134,
"031933" label 639 is corrected to 638.

@type labels_path: Path
@param labels_path: Path to the labels.json file.
@rtype: Path
@return: Path to the cleaned labels file.
"""
with open(labels_path) as f:
labels_data = json.load(f)

classes = labels_data["classes"]
labels = labels_data["labels"]

modified = False

# Fix duplicate class names
# First "crane" (bird) should be renamed to "crane_bird"
crane_indices = [i for i, c in enumerate(classes) if c == "crane"]
if len(crane_indices) >= 1:
first_crane_idx = crane_indices[0]
classes[first_crane_idx] = "crane_bird"
logger.info(
f"Renamed class 'crane' at index {first_crane_idx} to 'crane_bird'"
)
modified = True

# Second "maillot" should be renamed to "maillot_swim_suit"
maillot_indices = [i for i, c in enumerate(classes) if c == "maillot"]
if len(maillot_indices) >= 2:
second_maillot_idx = maillot_indices[1]
classes[second_maillot_idx] = "maillot_swim_suit"
logger.info(
f"Renamed class 'maillot' at index {second_maillot_idx} "
"to 'maillot_swim_suit'"
)
modified = True

# Fix misindexed labels
# Image 006742 should map to index 134, not 517
if labels.get("006742") == 517:
labels["006742"] = 134
logger.info("Fixed label index for image '006742': 517 -> 134")
modified = True

# Image 031933 should map to index 638, not 639
if labels.get("031933") == 639:
labels["031933"] = 638
logger.info("Fixed label index for image '031933': 639 -> 638")
modified = True

if not modified:
return labels_path

labels_data["classes"] = classes
labels_data["labels"] = labels

cleaned_labels_path = labels_path.with_name("labels_fixed.json")
with open(cleaned_labels_path, "w") as f:
json.dump(labels_data, f)

logger.info(f"Cleaned annotations saved to {cleaned_labels_path}")
return cleaned_labels_path
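For reference, the effect of clean_imagenet_annotations on a minimal, fabricated labels.json (a real ImageNet export has 1000 classes):

labels_data = {
    "classes": ["crane", "maillot", "maillot"],
    "labels": {"006742": 517, "031933": 639},
}
# After cleaning, labels_fixed.json would contain:
#   "classes": ["crane_bird", "maillot", "maillot_swim_suit"]
#   "labels":  {"006742": 134, "031933": 638}
# The original labels.json is left untouched on disk.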
17 changes: 15 additions & 2 deletions luxonis_ml/data/parsers/luxonis_parser.py
@@ -65,6 +65,7 @@
dataset_plugin: T = None,
dataset_type: DatasetType | None = None,
task_name: str | dict[str, str] | None = None,
skip_clean: bool = False,
**kwargs,
):
"""High-level abstraction over various parsers.
@@ -104,6 +105,10 @@
a dictionary with class names as keys and task names as values.
In the latter case, the task name for a record with a given
class name will be taken from the dictionary.
@type skip_clean: bool
@param skip_clean: If C{True}, skip automatic cleaning of known
dataset issues (e.g., duplicate images in COCO, duplicate
class names in ImageNet). Defaults to C{False}.
@type kwargs: Dict[str, Any]
@param kwargs: Additional C{kwargs} to be passed to the
constructor of specific L{BaseDataset} implementation.
Expand Down Expand Up @@ -155,6 +160,7 @@ class name will be taken from the dictionary.
self.parser = self.parsers[self.dataset_type](
self.dataset, self.dataset_type, task_name
)
self.skip_clean = skip_clean

@overload
def parse(self: "LuxonisParser[str]", **kwargs) -> BaseDataset: ...
@@ -216,7 +222,9 @@ def _parse_dir(self, **kwargs) -> BaseDataset:
@return: C{LDF} with all the images and annotations parsed.
"""

- return self.parser.parse_dir(self.dataset_dir, **kwargs)
+ return self.parser.parse_dir(
+     self.dataset_dir, skip_clean=self.skip_clean, **kwargs
+ )

def _parse_split(
self,
Expand Down Expand Up @@ -252,7 +260,12 @@ def _parse_split(
f"Dataset {self.dataset_dir} is not in the expected format for {self.dataset_type} parser."
)
return self.parser.parse_split(
- split, random_split, split_ratios, **parsed_kwargs, **kwargs
+ split,
+ random_split,
+ split_ratios,
+ skip_clean=self.skip_clean,
+ **parsed_kwargs,
+ **kwargs,
)

def _download_roboflow_dataset(
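Putting the pieces together, a minimal end-to-end sketch; the public import path and the positional dataset_dir argument are assumptions inferred from the CLI code above:

from luxonis_ml.data import LuxonisParser  # assumed import path

parser = LuxonisParser(
    "path/to/coco_dataset",  # hypothetical local dataset directory
    skip_clean=True,         # same effect as the CLI's --no-clean flag
)
dataset = parser.parse()

# With the default skip_clean=False, COCO annotation files pass through
# clean_annotations() and flat ImageNet exports through
# clean_imagenet_annotations() before parsing begins.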