Skip to content

Commit de1d69f

Browse files
authored
Improved Duplicate Warnings (#184)
1 parent 0183a45 commit de1d69f

File tree

9 files changed

+67
-26
lines changed

9 files changed

+67
-26
lines changed

luxonis_ml/data/datasets/luxonis_dataset.py

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,30 @@ def _load_df_offline(
199199
dfs = [pl.read_parquet(file) for file in path.glob("*.parquet")]
200200
return pl.concat(dfs) if dfs else None
201201

202-
@overload
def _get_file_index(
    self, lazy: Literal[False] = ...
) -> Optional[pl.DataFrame]: ...

# No default here: the runtime default is ``False``, so only the
# ``Literal[False]`` overload may match a call that omits ``lazy``.
@overload
def _get_file_index(
    self, lazy: Literal[True]
) -> Optional[pl.LazyFrame]: ...

def _get_file_index(
    self, lazy: bool = False
) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]:
    """Load the file index stored at ``metadata/file_index.parquet``.

    Args:
        lazy: When true, the parquet file is opened with
            ``pl.scan_parquet`` and a ``pl.LazyFrame`` is returned;
            otherwise it is read eagerly with ``pl.read_parquet`` and a
            ``pl.DataFrame`` is returned.

    Returns:
        The index with every column matching ``^__index_level_.*$``
        removed, or ``None`` when ``get_file`` yields no path or the
        path does not exist.
    """
    path = get_file(
        self.fs, "metadata/file_index.parquet", self.media_path
    )
    if path is None or not path.exists():
        return None

    df = pl.scan_parquet(path) if lazy else pl.read_parquet(path)

    # Strip the ``__index_level_*`` helper columns from the result.
    return df.select(pl.all().exclude("^__index_level_.*$"))
211227

212228
def _write_index(
@@ -438,7 +454,7 @@ def _process_arrays(self, batch_data: List[DatasetRecord]) -> None:
438454
uuid_dict[str(ann.path)] = uuid
439455
ann.path = Path(uuid).with_suffix(ann.path.suffix)
440456
else:
441-
ann.path = ann.path.absolute()
457+
ann.path = ann.path.absolute().resolve()
442458
self.progress.stop()
443459
self.progress.remove_task(task)
444460
if self.is_remote:
@@ -496,7 +512,7 @@ def _add_process_batch(
496512
new_index["uuid"].append(uuid)
497513
new_index["file"].append(file)
498514
new_index["original_filepath"].append(
499-
str(filepath.absolute())
515+
str(filepath.absolute().resolve())
500516
)
501517
processed_uuids.add(uuid)
502518

@@ -514,7 +530,9 @@ def add(
514530

515531
batch_data: list[DatasetRecord] = []
516532

517-
classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(OrderedSet)
533+
classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(
534+
lambda: OrderedSet([])
535+
)
518536
num_kpts_per_task: Dict[str, int] = {}
519537

520538
annotations_path = get_dir(
@@ -584,36 +602,55 @@ def add(
584602

585603
def _warn_on_duplicates(self) -> None:
    """Log warnings about duplicated UUIDs and duplicated annotations.

    Two checks run on the lazily loaded dataset frame joined with the
    file index on ``uuid``:

    1. every UUID that is associated with more than one distinct
       ``file`` value is reported together with those file names;
    2. every combination of ``original_filepath``, ``task``, ``type``,
       ``annotation`` and ``instance_id`` that occurs more than once is
       reported, skipping rows whose annotation is the literal ``"{}"``.

    Returns immediately, without logging, when either the dataset frame
    or the file index is unavailable.
    """
    df = self._load_df_offline(lazy=True)
    index_df = self._get_file_index(lazy=True)
    if df is None or index_df is None:
        return

    # Both frames carry a ``file`` column; the copy coming from the index
    # is suffixed ``file_right`` by the join and is not needed here.
    df = df.join(index_df, on="uuid").drop("file_right")

    # Warn on duplicate UUIDs
    duplicates_paired = (
        df.group_by("uuid")
        .agg(pl.col("file").n_unique().alias("file_count"))
        .filter(pl.col("file_count") > 1)
        .join(df, on="uuid")
        .select("uuid", "file")
        .unique()
        .group_by("uuid")
        .agg(pl.col("file").alias("files"))
        # ``list.len`` is the per-row list length. A bare ``Expr.len``
        # counts the rows of the column instead, which would discard the
        # result whenever exactly one UUID is affected.
        .filter(pl.col("files").list.len() > 1)
        .collect()
    )
    for uuid, files in duplicates_paired.iter_rows():
        self.logger.warning(
            f"UUID: {uuid} has multiple file names: {files}"
        )

    # Warn on duplicate annotations
    duplicate_annotation = (
        df.group_by(
            "original_filepath",
            "task",
            "type",
            "annotation",
            "instance_id",
        )
        .agg(pl.len().alias("count"))
        .filter(pl.col("count") > 1)
        .filter(pl.col("annotation") != "{}")
        # ``instance_id`` only serves as a grouping key; dropping it
        # leaves exactly the five columns unpacked below.
        .drop("instance_id")
    ).collect()

    for (
        file_name,
        task,
        type_,
        annotation,
        count,
    ) in duplicate_annotation.iter_rows():
        # Avoid dumping raw mask payloads into the log message.
        if "RLE" in type_ or "Mask" in type_:
            annotation = "<binary mask>"
        self.logger.warning(
            f"File '{file_name}' has the same '{type_}' annotation "
            f"'{annotation}' ({task=}) added {count} times."
        )
618655

619656
def get_splits(self) -> Optional[Dict[str, List[str]]]:

luxonis_ml/data/datasets/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def find_filepath_uuid(
8686
if index is None:
8787
return None
8888

89-
abs_path = str(Path(filepath).absolute())
89+
abs_path = str(Path(filepath).absolute().resolve())
9090
matched = index.filter(pl.col("original_filepath") == abs_path)
9191

9292
if len(matched):

luxonis_ml/data/loaders/luxonis_loader.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def __init__(
8484
for view in self.view:
8585
self.instances.extend(splits[view])
8686

87-
self.idx_to_df_row = []
87+
self.idx_to_df_row: list[list[int]] = []
8888
for uuid in self.instances:
8989
boolean_mask = df["uuid"] == uuid
9090
row_indexes = boolean_mask.arg_true().to_list()
@@ -139,7 +139,9 @@ def __getitem__(self, idx: int) -> LuxonisLoaderOutput:
139139
else:
140140
picked_indices = set()
141141
max_val = len(self)
142-
while len(picked_indices) < self.augmentations.aug_batch_size - 1:
142+
while (
143+
len(picked_indices) < self.augmentations.aug_batch_size - 1
144+
):
143145
rand_idx = random.randint(0, max_val - 1)
144146
if rand_idx != idx and rand_idx not in picked_indices:
145147
picked_indices.add(rand_idx)

luxonis_ml/data/parsers/classification_directory_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def generator() -> DatasetIterator:
7777
for class_name in class_names:
7878
for img_path in (class_dir / class_name).iterdir():
7979
yield {
80-
"file": str(img_path.absolute()),
80+
"file": str(img_path.absolute().resolve()),
8181
"annotation": {
8282
"type": "classification",
8383
"class": class_name,

luxonis_ml/data/parsers/coco_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ def generator() -> DatasetIterator:
226226
ann_dict[img_id].append(ann)
227227

228228
for img_id, img in img_dict.items():
229-
path = image_dir.absolute() / img["file_name"]
229+
path = image_dir.absolute().resolve() / img["file_name"]
230230
if not path.exists():
231231
continue
232232
path = str(path)

luxonis_ml/data/parsers/create_ml_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def from_split(
9292
class_names = set()
9393
images_annotations = []
9494
for annotations in annotations_data:
95-
path = image_dir.absolute() / annotations["image"]
95+
path = image_dir.absolute().resolve() / annotations["image"]
9696
if not path.exists():
9797
continue
9898
file = str(path)

luxonis_ml/data/parsers/segmentation_mask_directory_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def from_split(
104104
def generator() -> DatasetIterator:
105105
for mask_path in seg_dir.glob("*_mask.*"):
106106
image_path = next(image_dir.glob(f"{mask_path.stem[:-5]}.*"))
107-
file = str(image_path.absolute())
107+
file = str(image_path.absolute().resolve())
108108
mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
109109

110110
ids = np.unique(mask)

luxonis_ml/data/parsers/voc_parser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,9 @@ def from_split(
8686
annotation_data = ET.parse(anno_xml)
8787
root = annotation_data.getroot()
8888

89-
path = image_dir.absolute() / self._xml_find(root, "filename")
89+
path = image_dir.absolute().resolve() / self._xml_find(
90+
root, "filename"
91+
)
9092
if not path.exists():
9193
continue
9294

luxonis_ml/data/parsers/yolov4_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def generator() -> DatasetIterator:
9898
data = ann_line.split(" ")
9999
img_path = data[0]
100100

101-
path = image_dir.absolute() / img_path
101+
path = image_dir.absolute().resolve() / img_path
102102
if not path.exists():
103103
continue
104104

0 commit comments

Comments
 (0)