luxonis_ml/data/README.md (1 change: 1 addition & 0 deletions)
@@ -295,6 +295,7 @@ The available commands are:
- `luxonis_ml data info <dataset_name>` - prints information about the dataset
- `luxonis_ml data inspect <dataset_name>` - renders the data in the dataset on screen using `cv2`
- `luxonis_ml data health <dataset_name>` - checks the health of the dataset and logs and renders dataset statistics
- `luxonis_ml data sanitize <dataset_name>` - removes duplicate files and duplicate annotations from the dataset
- `luxonis_ml data delete <dataset_name>` - deletes the dataset
- `luxonis_ml data export <dataset_name>` - exports the dataset to a chosen format and directory
- `luxonis_ml data push <dataset_name>` - pushes local dataset to remote storage
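The new `sanitize` command slots into the usual workflow next to `health`. A quick illustration (the dataset name `my_dataset` is a placeholder):

```bash
# Inspect the dataset; the health report flags duplicate files and annotations.
luxonis_ml data health my_dataset

# Remove duplicate files and duplicate annotations in place.
luxonis_ml data sanitize my_dataset
```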
luxonis_ml/data/__main__.py (19 changes: 19 additions & 0 deletions)
@@ -678,6 +678,12 @@ def health(

console.print(summary_table)

    if missing_annotations or duplicate_uuids or duplicate_annotations:
        console.print(
            "[bold red]Dataset is unhealthy![/bold red] "
            "Run [green]luxonis_ml data sanitize[/green] "
            "to automatically remove duplicate files and annotations."
        )

all_task_names = sorted(
set(stats["class_distributions"].keys())
| set(stats["heatmaps"].keys())
@@ -935,5 +941,18 @@ def merge(
)


@app.command()
def sanitize(
name: DatasetNameArgument,
bucket_storage: BucketStorage = bucket_option,
):
"""Remove duplicate annotations and duplicate files from the
dataset."""
check_exists(name, bucket_storage)
dataset = LuxonisDataset(name, bucket_storage=bucket_storage)
dataset.remove_duplicates()
print(f"[green]Duplicates removed from dataset '{name}'.")


if __name__ == "__main__":
app()
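Like the other subcommands, `sanitize` accepts the shared bucket-storage option, so it should also work on datasets stored remotely. A hedged sketch; the exact flag spelling and accepted values are assumptions inferred from the `BucketStorage` parameter, not verified against the CLI help:

```bash
# Sanitize a dataset kept in cloud storage (flag name and value assumed).
luxonis_ml data sanitize my_dataset --bucket-storage gcs
```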
luxonis_ml/data/datasets/luxonis_dataset.py (31 changes: 31 additions & 0 deletions)
@@ -1613,3 +1613,34 @@
stats["heatmaps"] = get_heatmaps(df, sample_size)

return stats

    def remove_duplicates(self) -> None:
        """Removes duplicate files and annotations from the dataset."""
        df = self._load_df_offline(lazy=True)
        index = self._get_index(lazy=True)

        if df is None or index is None:
            raise ValueError(
                "Dataset index or dataframe with annotations is not available."
            )

        df_extended = df.join(index, on="uuid").drop("file_right")
        duplicate_info = get_duplicates_info(df_extended)

        # For every group of files sharing a UUID, keep the first file and
        # drop the remaining identical copies.
        duplicate_files_to_remove = [
            file
            for duplicates in duplicate_info["duplicate_uuids"]
            for file in duplicates["files"][1:]
        ]
        df = df.filter(~pl.col("file").is_in(duplicate_files_to_remove))

        # Drop repeated annotations on the same file, keeping the first
        # occurrence of each (file, annotation) pair.
        df = df.unique(subset=["file", "annotation"], maintain_order=True)

        self._save_df_offline(df.collect())

        if self.is_remote:
            self.fs.put_dir(
                local_paths=self.local_path / "annotations",
                remote_dir="annotations",
                copy_contents=True,
            )
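Outside the CLI, the same cleanup is available directly on the dataset object. A minimal sketch, assuming `LuxonisDataset` is importable from `luxonis_ml.data` (as `LuxonisParser` is in the tests) and that a local dataset named `my_dataset` already exists:

```python
from luxonis_ml.data import LuxonisDataset

# Open the existing dataset by name (the name is a placeholder).
dataset = LuxonisDataset("my_dataset")

# Drop duplicate files (same UUID) and repeated annotations on the same
# file, then persist the cleaned annotation dataframe; for remote
# datasets the annotations directory is pushed back to storage.
dataset.remove_duplicates()
```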
tests/test_data/test_health.py (66 changes: 65 additions & 1 deletion)
@@ -2,12 +2,14 @@

import pytest

-from luxonis_ml.data import LuxonisParser
+from luxonis_ml.data import DatasetIterator, LuxonisParser
from luxonis_ml.data.utils.plot_utils import (
_prepare_class_data,
_prepare_heatmap_data,
)

from .utils import create_dataset, create_image


@pytest.mark.parametrize("url", ["COCO_people_subset.zip"])
def test_dataset_health(
@@ -81,3 +83,65 @@ def test_dataset_health(
)
<= 2
)


def test_dataset_sanitize(
dataset_name: str,
tempdir: Path,
):
def generator() -> DatasetIterator:
for i in range(5):
img = create_image(i, tempdir)
img_copy_path = tempdir / f"img_{i}_copy.jpg"
img_copy_path.write_bytes(img.read_bytes())
# Original image with annotations
yield {
"file": img,
"annotation": {
"class": "person",
"boundingbox": {
"x": 0.1,
"y": 0.1,
"w": 0.8,
"h": 0.8,
},
},
}
# Duplicate image with same UUID
yield {
"file": img_copy_path,
"annotation": {
"class": "person",
"boundingbox": {
"x": 0.11,
"y": 0.11,
"w": 0.78,
"h": 0.78,
},
},
}
# Duplicate annotations
yield {
"file": img,
"annotation": {
"class": "person",
"boundingbox": {
"x": 0.1,
"y": 0.1,
"w": 0.8,
"h": 0.8,
},
},
}

dataset = create_dataset(dataset_name, generator())

stats_before = dataset.get_statistics()
assert len(stats_before["duplicates"]["duplicate_uuids"]) == 5
assert len(stats_before["duplicates"]["duplicate_annotations"]) == 10

dataset.remove_duplicates()

stats_after = dataset.get_statistics()
assert len(stats_after["duplicates"]["duplicate_uuids"]) == 0
assert len(stats_after["duplicates"]["duplicate_annotations"]) == 0
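To run just the new test (assuming the suite's existing fixtures provide `dataset_name` and `tempdir`, as they do for `test_dataset_health`):

```bash
pytest tests/test_data/test_health.py -k test_dataset_sanitize
```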