Skip to content

Commit 8ce757c

Browse files
committed
Add --no-clean option to skip automatic dataset cleaning
1 parent dcbf319 commit 8ce757c

File tree

4 files changed

+34
-5
lines changed

4 files changed

+34
-5
lines changed

luxonis_ml/data/__main__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,15 @@ def parse(
575575
show_default=False,
576576
),
577577
] = None,
578+
no_clean: Annotated[
579+
bool,
580+
typer.Option(
581+
...,
582+
"--no-clean",
583+
help="Skip automatic cleaning of known dataset issues "
584+
"(e.g., duplicate images in COCO, duplicate class names in ImageNet).",
585+
),
586+
] = False,
578587
):
579588
"""Parses a directory with data and creates Luxonis dataset."""
580589
parser = LuxonisParser(
@@ -584,6 +593,7 @@ def parse(
584593
delete_local=delete_local,
585594
save_dir=save_dir,
586595
task_name=task_name,
596+
skip_clean=no_clean,
587597
)
588598
dataset = parser.parse()
589599

luxonis_ml/data/parsers/coco_parser.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def from_dir(
127127
use_keypoint_ann: bool = False,
128128
keypoint_ann_paths: dict[str, str] | None = None,
129129
split_val_to_test: bool = True,
130+
skip_clean: bool = False,
130131
) -> tuple[list[Path], list[Path], list[Path]]:
131132
dir_format, splits = COCOParser._detect_dataset_dir_format(dataset_dir)
132133
if dir_format is None:
@@ -160,7 +161,9 @@ def from_dir(
160161
and dir_format is COCOFormat.FIFTYONE
161162
else train_paths["annotation_path"]
162163
)
163-
cleaned_annotation_path = clean_annotations(train_ann_path)
164+
cleaned_annotation_path = (
165+
train_ann_path if skip_clean else clean_annotations(train_ann_path)
166+
)
164167
added_train_imgs = self._parse_split(
165168
image_dir=train_paths["image_dir"],
166169
annotation_path=cleaned_annotation_path,

luxonis_ml/data/parsers/fiftyone_classification_parser.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ def _extract_native_classes(labels_path: Path) -> dict[int, str]:
114114
classes = labels_data.get("classes", [])
115115
return dict(enumerate(classes))
116116

117-
def from_split(self, split_path: Path) -> ParserOutput:
117+
def from_split(
118+
self, split_path: Path, skip_clean: bool = False
119+
) -> ParserOutput:
118120
labels_path = split_path / "labels.json"
119121
data_path = split_path / "data"
120122

@@ -123,7 +125,8 @@ def from_split(self, split_path: Path) -> ParserOutput:
123125
# and label indices, and set native classes
124126
is_flat_structure = split_path.name not in self.SPLIT_NAMES
125127
if is_flat_structure:
126-
labels_path = clean_imagenet_annotations(labels_path)
128+
if not skip_clean:
129+
labels_path = clean_imagenet_annotations(labels_path)
127130
native_classes = self._extract_native_classes(labels_path)
128131
if native_classes:
129132
self.dataset.set_native_classes(native_classes, "imagenet")

luxonis_ml/data/parsers/luxonis_parser.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def __init__(
6565
dataset_plugin: T = None,
6666
dataset_type: DatasetType | None = None,
6767
task_name: str | dict[str, str] | None = None,
68+
skip_clean: bool = False,
6869
**kwargs,
6970
):
7071
"""High-level abstraction over various parsers.
@@ -104,6 +105,10 @@ def __init__(
104105
a dictionary with class names as keys and task names as values.
105106
In the latter case, the task name for a record with a given
106107
class name will be taken from the dictionary.
108+
@type skip_clean: bool
109+
@param skip_clean: If C{True}, skip automatic cleaning of known
110+
dataset issues (e.g., duplicate images in COCO, duplicate
111+
class names in ImageNet). Defaults to C{False}.
107112
@type kwargs: Dict[str, Any]
108113
@param kwargs: Additional C{kwargs} to be passed to the
109114
constructor of specific L{BaseDataset} implementation.
@@ -155,6 +160,7 @@ class name will be taken from the dictionary.
155160
self.parser = self.parsers[self.dataset_type](
156161
self.dataset, self.dataset_type, task_name
157162
)
163+
self.skip_clean = skip_clean
158164

159165
@overload
160166
def parse(self: "LuxonisParser[str]", **kwargs) -> BaseDataset: ...
@@ -216,7 +222,9 @@ def _parse_dir(self, **kwargs) -> BaseDataset:
216222
@return: C{LDF} with all the images and annotations parsed.
217223
"""
218224

219-
return self.parser.parse_dir(self.dataset_dir, **kwargs)
225+
return self.parser.parse_dir(
226+
self.dataset_dir, skip_clean=self.skip_clean, **kwargs
227+
)
220228

221229
def _parse_split(
222230
self,
@@ -252,7 +260,12 @@ def _parse_split(
252260
f"Dataset {self.dataset_dir} is not in the expected format for {self.dataset_type} parser."
253261
)
254262
return self.parser.parse_split(
255-
split, random_split, split_ratios, **parsed_kwargs, **kwargs
263+
split,
264+
random_split,
265+
split_ratios,
266+
skip_clean=self.skip_clean,
267+
**parsed_kwargs,
268+
**kwargs,
256269
)
257270

258271
def _download_roboflow_dataset(

0 commit comments

Comments
 (0)