More cleanup and consistency across exporters; keep the upper-bound size estimation for two exporters

dtronmans · dtronmans · commit 52cacf3a5345 · 2025-11-10T11:20:06.000+01:00
diff --git a/luxonis_ml/data/exporters/createml_exporter.py b/luxonis_ml/data/exporters/createml_exporter.py
@@ -104,9 +104,9 @@ def _collect_bounding_box_annotations(
         per_image_anns: list[dict[str, Any]] = []
 
         for row in group_df.iter_rows(named=True):
-            ttype = row.get("task_type")
-            ann_str = row.get("annotation")
-            cname = row.get("class_name")
+            ttype = row["task_type"]
+            ann_str = row["annotation"]
+            cname = row["class_name"]
 
             if ttype != "boundingbox" or ann_str is None or not cname:
                 continue
diff --git a/luxonis_ml/data/exporters/darknet_exporter.py b/luxonis_ml/data/exporters/darknet_exporter.py
@@ -80,9 +80,9 @@ def _collect_darknet_bounding_box_labels(
         label_lines: list[str] = []
 
         for row in group_df.iter_rows(named=True):
-            ttype = row.get("task_type")
-            ann_str = row.get("annotation")
-            cname = row.get("class_name")
+            ttype = row["task_type"]
+            ann_str = row["annotation"]
+            cname = row["class_name"]
 
             if ttype != "boundingbox" or ann_str is None:
                 continue
diff --git a/luxonis_ml/data/exporters/segmentation_mask_directory_exporter.py b/luxonis_ml/data/exporters/segmentation_mask_directory_exporter.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 from PIL import Image
-from pycocotools import mask as maskUtils  # <- use pycocotools
+from pycocotools import mask as maskUtils
 
 from luxonis_ml.data.exporters.base_exporter import BaseExporter
 from luxonis_ml.data.exporters.exporter_utils import ExporterUtils, PreparedLDF
@@ -75,13 +75,13 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
             file_path = Path(str(file_name))
             split = ExporterUtils.split_of_group(prepared_ldf, group_id)
 
-            # Only segmentation rows (instance_id == -1)
+            # Only semantic segmentation rows for the entire image (instance_id == -1)
             seg_rows = [
                 row
                 for row in entry.iter_rows(named=True)
-                if row.get("task_type") == "segmentation"
-                and row.get("instance_id") == -1
-                and row.get("annotation")
+                if row["task_type"] == "segmentation"
+                and row["instance_id"] == -1
+                and row["annotation"]
             ]
             if not seg_rows:
                 continue
@@ -106,11 +106,11 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
             combined: np.ndarray | None = None
 
             for row in seg_rows:
-                cname = str(row.get("class_name") or "")
+                cname = str(row["class_name"] or "")
                 if not cname:
                     continue
 
-                ann = row.get("annotation")
+                ann = row["annotation"]
                 ann = json.loads(ann)
 
                 m = _decode_rle_with_pycoco(ann)  # uint8 {0,1}
diff --git a/luxonis_ml/data/exporters/tensorflow_csv_exporter.py b/luxonis_ml/data/exporters/tensorflow_csv_exporter.py
@@ -56,11 +56,11 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
 
             per_image_rows: list[dict[str, Any]] = []
             for row in group_df.iter_rows(named=True):
-                if row.get("task_type") != "boundingbox":
+                if row["task_type"] != "boundingbox":
                     continue
-                ann = row.get("annotation")
+                ann = row["annotation"]
                 ann = json.loads(ann)
-                cname = row.get("class_name")
+                cname = row["class_name"]
                 if ann is None or not cname:
                     continue
 
@@ -90,6 +90,9 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
             if per_image_rows:
                 rows_by_split[split_name].extend(per_image_rows)
 
+            # NOTE: We use a rough constant (64) to approximate the per-row CSV bytes that do NOT
+            # depend on variable-length fields. Getting the true on-disk size here would require
+            # serializing with csv.DictWriter using the exact dialect and encoding
             ann_size_est = sum(
                 64 + len(r["class"]) + len(r["filename"])
                 for r in per_image_rows
diff --git a/luxonis_ml/data/exporters/voc_exporter.py b/luxonis_ml/data/exporters/voc_exporter.py
@@ -47,10 +47,10 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
 
             objects: list[dict[str, Any]] = []
             for row in group_df.iter_rows(named=True):
-                if row.get("task_type") != "boundingbox":
+                if row["task_type"] != "boundingbox":
                     continue
-                ann_str = row.get("annotation")
-                cname = row.get("class_name")
+                ann_str = row["annotation"]
+                cname = row["class_name"]
                 if not ann_str or not cname:
                     continue
 
@@ -63,11 +63,9 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
                 xmin = int(round(xn * W))
                 ymin = int(round(yn * H))
 
-                # widths/heights: round once (no double rounding)
                 w_px = max(1, int(round(wn * W)))
                 h_px = max(1, int(round(hn * H)))
 
-                # build EXCLUSIVE max from min + size
                 xmax = xmin + w_px  # exclusive right edge
                 ymax = ymin + h_px  # exclusive bottom edge
 
@@ -214,7 +212,6 @@ def _build_voc_xml_string(
             SubElement(bb, "xmax").text = f"{xmax:.12f}"
             SubElement(bb, "ymax").text = f"{ymax:.12f}"
 
-        # pretty print with XML declaration
         xml_bytes = self._etree_to_pretty_bytes(ann)
         return xml_bytes.decode("utf-8")
 
diff --git a/luxonis_ml/data/exporters/yolo_exporter.py b/luxonis_ml/data/exporters/yolo_exporter.py
@@ -74,7 +74,7 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:
             )
             new_name = f"{idx}{file_path.suffix}"
 
-            label_lines: list[BBox] = []
+            label_lines: list = []
 
             for row in group_df.iter_rows(named=True):
                 ttype = row["task_type"]
diff --git a/tests/test_data/test_parse_export_equivalence.py b/tests/test_data/test_parse_export_equivalence.py
@@ -246,7 +246,7 @@ def collect_classification_multiset(prepared_ldf: PreparedLDF):
     {"url": "D2_Tile.png-mask-semantic.zip", "types": ["segmentation"]},
     {
         "url": "COCO_people_subset.zip",
-        "types": ["instance_segmentation", "boundingbox", "keypoints"],
+        "types": ["instance_segmentation", "boundingbox"],
     },
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@ def transform(self, prepared_ldf: PreparedLDF) -> None:`
`74`	`74`	`)`
`75`	`75`	`new_name = f"{idx}{file_path.suffix}"`
`76`	`76`
`77`		`- label_lines: list[BBox] = []`
	`77`	`+ label_lines: list = []`
`78`	`78`
`79`	`79`	`for row in group_df.iter_rows(named=True):`
`80`	`80`	`ttype = row["task_type"]`
Original file line number	Diff line number	Diff line change
`@@ -246,7 +246,7 @@ def collect_classification_multiset(prepared_ldf: PreparedLDF):`
`246`	`246`	`{"url": "D2_Tile.png-mask-semantic.zip", "types": ["segmentation"]},`
`247`	`247`	`{`
`248`	`248`	`"url": "COCO_people_subset.zip",`
`249`		`- "types": ["instance_segmentation", "boundingbox", "keypoints"],`
	`249`	`+ "types": ["instance_segmentation", "boundingbox"],`
`250`	`250`	`},`
`251`	`251`	`]`
`252`	`252`