@@ -199,14 +199,30 @@ def _load_df_offline(
         dfs = [pl.read_parquet(file) for file in path.glob("*.parquet")]
         return pl.concat(dfs) if dfs else None
 
-    def _get_file_index(self) -> Optional[pl.DataFrame]:
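+    # Typing overloads: lazy=False is declared to return a DataFrame, lazy=True a LazyFrame.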
+    @overload
+    def _get_file_index(
+        self, lazy: Literal[False] = ...
+    ) -> Optional[pl.DataFrame]: ...
+
+    @overload
+    def _get_file_index(
+        self, lazy: Literal[True] = ...
+    ) -> Optional[pl.LazyFrame]: ...
+
+    def _get_file_index(
+        self, lazy: bool = False
+    ) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]:
         path = get_file(
             self.fs, "metadata/file_index.parquet", self.media_path
         )
         if path is not None and path.exists():
-            return pl.read_parquet(path).select(
-                pl.all().exclude("^__index_level_.*$")
-            )
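+            # read_parquet loads the index eagerly; scan_parquet defers I/O until .collect().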
+            if not lazy:
+                df = pl.read_parquet(path)
+            else:
+                df = pl.scan_parquet(path)
+
+            return df.select(pl.all().exclude("^__index_level_.*$"))
+
         return None
 
     def _write_index(
@@ -438,7 +454,7 @@ def _process_arrays(self, batch_data: List[DatasetRecord]) -> None:
                 uuid_dict[str(ann.path)] = uuid
                 ann.path = Path(uuid).with_suffix(ann.path.suffix)
             else:
-                ann.path = ann.path.absolute()
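+                # resolve() additionally normalizes symlinks and ".." segments.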
+                ann.path = ann.path.absolute().resolve()
         self.progress.stop()
         self.progress.remove_task(task)
         if self.is_remote:
@@ -496,7 +512,7 @@ def _add_process_batch(
             new_index["uuid"].append(uuid)
             new_index["file"].append(file)
             new_index["original_filepath"].append(
-                str(filepath.absolute())
+                str(filepath.absolute().resolve())
             )
             processed_uuids.add(uuid)
 
@@ -514,7 +530,9 @@ def add(
 
         batch_data: list[DatasetRecord] = []
 
-        classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(OrderedSet)
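+        # A lambda factory gives every missing key a fresh, explicitly empty OrderedSet.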
+        classes_per_task: Dict[str, OrderedSet[str]] = defaultdict(
+            lambda: OrderedSet([])
+        )
         num_kpts_per_task: Dict[str, int] = {}
 
         annotations_path = get_dir(
@@ -584,36 +602,55 @@ def add(
 
     def _warn_on_duplicates(self) -> None:
         df = self._load_df_offline(lazy=True)
-        if df is None:
+        index_df = self._get_file_index(lazy=True)
+        if df is None or index_df is None:
             return
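+        # The file index supplies original_filepath, used below for duplicate detection.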
+        df = df.join(index_df, on="uuid").drop("file_right")
         # Warn on duplicate UUIDs
         duplicates_paired = (
             df.group_by("uuid")
             .agg(pl.col("file").n_unique().alias("file_count"))
             .filter(pl.col("file_count") > 1)
             .join(df, on="uuid")
-            .select(["uuid", "file"])
+            .select("uuid", "file")
             .unique()
             .group_by("uuid")
-            .agg([pl.col("file").alias("files")])
+            .agg(pl.col("file").alias("files"))
             .filter(pl.col("files").len() > 1)
+            .collect()
         )
-        duplicates_paired_df = duplicates_paired.collect()
-        for uuid, files in duplicates_paired_df.iter_rows():
+        for uuid, files in duplicates_paired.iter_rows():
             self.logger.warning(
                 f"UUID: {uuid} has multiple file names: {files}"
             )
 
         # Warn on duplicate annotations
         duplicate_annotation = (
-            df.group_by(["file", "annotation"])
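+            # instance_id is part of the key so distinct instances with identical shapes are not flagged.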
+            df.group_by(
+                "original_filepath",
+                "task",
+                "type",
+                "annotation",
+                "instance_id",
+            )
             .agg(pl.len().alias("count"))
             .filter(pl.col("count") > 1)
-        )
-        duplicate_annotation_df = duplicate_annotation.collect()
-        for file_name, annotation, _ in duplicate_annotation_df.iter_rows():
+            .filter(pl.col("annotation") != "{}")
+            .drop("instance_id")
+        ).collect()
+
+        for (
+            file_name,
+            task,
+            type_,
+            annotation,
+            count,
+        ) in duplicate_annotation.iter_rows():
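+            # RLE / mask annotations serialize to long binary strings; log a short placeholder instead.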
649+ if "RLE" in type_ or "Mask" in type_ :
650+ annotation = "<binary mask>"
             self.logger.warning(
-                f"File '{file_name}' has the same annotation '{annotation}' added multiple times."
+                f"File '{file_name}' has the same '{type_}' annotation "
+                f"'{annotation}' ({task=}) added {count} times."
             )
 
     def get_splits(self) -> Optional[Dict[str, List[str]]]: