luxonis_ml/data/README.md (1 change: 1 addition & 0 deletions)
@@ -295,6 +295,7 @@ The available commands are:
- `luxonis_ml data info <dataset_name>` - prints information about the dataset
- `luxonis_ml data inspect <dataset_name>` - renders the data in the dataset on screen using `cv2`
- `luxonis_ml data health <dataset_name>` - checks the health of the dataset and logs and renders dataset statistics
- `luxonis_ml data sanitize <dataset_name>` - removes duplicate files and duplicate annotations from the dataset
- `luxonis_ml data delete <dataset_name>` - deletes the dataset
- `luxonis_ml data export <dataset_name>` - exports the dataset to a chosen format and directory
- `luxonis_ml data push <dataset_name>` - pushes local dataset to remote storage
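The new `sanitize` command slots into the usual workflow next to `health`. A quick illustration (the dataset name `my_dataset` is a placeholder):

```bash
# Inspect the dataset; the health report flags duplicate files and annotations.
luxonis_ml data health my_dataset

# Remove duplicate files and duplicate annotations in place.
luxonis_ml data sanitize my_dataset
```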
luxonis_ml/data/__main__.py (19 changes: 19 additions & 0 deletions)
@@ -678,6 +678,12 @@ def health(

console.print(summary_table)

    if missing_annotations or duplicate_uuids or duplicate_annotations:
        console.print(
            "[bold red]Dataset is unhealthy![/bold red] "
            "Run [green]luxonis_ml data sanitize[/green] "
            "to automatically remove duplicate files and annotations."
        )

all_task_names = sorted(
set(stats["class_distributions"].keys())
| set(stats["heatmaps"].keys())
@@ -935,5 +941,18 @@ def merge(
)


@app.command()
def sanitize(
name: DatasetNameArgument,
bucket_storage: BucketStorage = bucket_option,
):
"""Remove duplicate annotations and duplicate files from the
dataset."""
check_exists(name, bucket_storage)
dataset = LuxonisDataset(name, bucket_storage=bucket_storage)
dataset.remove_duplicates()
print(f"[green]Duplicates removed from dataset '{name}'.")


if __name__ == "__main__":
app()
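Like the other subcommands, `sanitize` accepts the shared bucket-storage option, so it should also work on datasets stored remotely. A hedged sketch; the exact flag spelling and accepted values are assumptions inferred from the `BucketStorage` parameter, not verified against the CLI help:

```bash
# Sanitize a dataset kept in cloud storage (flag name and value assumed).
luxonis_ml data sanitize my_dataset --bucket-storage gcs
```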
luxonis_ml/data/datasets/luxonis_dataset.py (31 changes: 31 additions & 0 deletions)
@@ -1613,3 +1613,34 @@
stats["heatmaps"] = get_heatmaps(df, sample_size)

return stats

    def remove_duplicates(self) -> None:
        """Removes duplicate files and annotations from the dataset."""
        df = self._load_df_offline(lazy=True)
        index = self._get_index(lazy=True)

        if df is None or index is None:
            raise ValueError(
                "Dataset index or dataframe with annotations is not available."
            )

        df_extended = df.join(index, on="uuid").drop("file_right")
        duplicate_info = get_duplicates_info(df_extended)

        # For every group of files sharing a UUID, keep the first file and
        # drop the remaining identical copies.
        duplicate_files_to_remove = [
            file
            for duplicates in duplicate_info["duplicate_uuids"]
            for file in duplicates["files"][1:]
        ]
        df = df.filter(~pl.col("file").is_in(duplicate_files_to_remove))

        # Drop repeated annotations on the same file, keeping the first
        # occurrence of each (file, annotation) pair.
        df = df.unique(subset=["file", "annotation"], maintain_order=True)

        self._save_df_offline(df.collect())

        if self.is_remote:
            self.fs.put_dir(
                local_paths=self.local_path / "annotations",
                remote_dir="annotations",
                copy_contents=True,
            )
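Outside the CLI, the same cleanup is available directly on the dataset object. A minimal sketch, assuming `LuxonisDataset` is importable from `luxonis_ml.data` (as `LuxonisParser` is in the tests) and that a local dataset named `my_dataset` already exists:

```python
from luxonis_ml.data import LuxonisDataset

# Open the existing dataset by name (the name is a placeholder).
dataset = LuxonisDataset("my_dataset")

# Drop duplicate files (same UUID) and repeated annotations on the same
# file, then persist the cleaned annotation dataframe; for remote
# datasets the annotations directory is pushed back to storage.
dataset.remove_duplicates()
```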
tests/test_data/test_health.py (66 changes: 65 additions & 1 deletion)
@@ -2,12 +2,14 @@

import pytest

-from luxonis_ml.data import LuxonisParser
+from luxonis_ml.data import DatasetIterator, LuxonisParser
from luxonis_ml.data.utils.plot_utils import (
_prepare_class_data,
_prepare_heatmap_data,
)

from .utils import create_dataset, create_image


@pytest.mark.parametrize("url", ["COCO_people_subset.zip"])
def test_dataset_health(
@@ -81,3 +83,65 @@ def test_dataset_health(
)
<= 2
)


def test_dataset_sanitize(
dataset_name: str,
tempdir: Path,
):
def generator() -> DatasetIterator:
for i in range(5):
img = create_image(i, tempdir)
img_copy_path = tempdir / f"img_{i}_copy.jpg"
img_copy_path.write_bytes(img.read_bytes())
# Original image with annotations
yield {
"file": img,
"annotation": {
"class": "person",
"boundingbox": {
"x": 0.1,
"y": 0.1,
"w": 0.8,
"h": 0.8,
},
},
}
# Duplicate image with same UUID
yield {
"file": img_copy_path,
"annotation": {
"class": "person",
"boundingbox": {
"x": 0.11,
"y": 0.11,
"w": 0.78,
"h": 0.78,
},
},
}
# Duplicate annotations
yield {
"file": img,
"annotation": {
"class": "person",
"boundingbox": {
"x": 0.1,
"y": 0.1,
"w": 0.8,
"h": 0.8,
},
},
}

dataset = create_dataset(dataset_name, generator())

stats_before = dataset.get_statistics()
assert len(stats_before["duplicates"]["duplicate_uuids"]) == 5
assert len(stats_before["duplicates"]["duplicate_annotations"]) == 10

dataset.remove_duplicates()

stats_after = dataset.get_statistics()
assert len(stats_after["duplicates"]["duplicate_uuids"]) == 0
assert len(stats_after["duplicates"]["duplicate_annotations"]) == 0
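To run just the new test (assuming the suite's existing fixtures provide `dataset_name` and `tempdir`, as they do for `test_dataset_health`):

```bash
pytest tests/test_data/test_health.py -k test_dataset_sanitize
```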