Changes from 45 commits
Commits (49)
9f59950
Initial structure for refactoring exports to native and to COCO Roboflow
dtronmans Oct 30, 2025
7408af1
the extract function which is the same for every dataset type
dtronmans Oct 30, 2025
3431980
temp changes for now to compare to initial behavior
dtronmans Oct 30, 2025
9b4fde3
Temp changes and fixes, cleanup
dtronmans Oct 31, 2025
7a0e63e
Format avoid circular import
dtronmans Oct 31, 2025
5ff2f39
More moving of files and logic
dtronmans Oct 31, 2025
48edb64
More moving and cleaning up of export functions
dtronmans Nov 3, 2025
201c91d
Refactor COCO format enum re-usable by coco parser and coco exporter
dtronmans Nov 3, 2025
c46ea68
transform function cleaned with _row_to_record method
dtronmans Nov 3, 2025
af18bed
Support for annotation formats where there is one annotation file per…
dtronmans Nov 3, 2025
b261a37
Checkpoint: safer flow and fix both pyright and pre-commit
dtronmans Nov 3, 2025
ae29e7e
More comment reformatting and reorder of BaseExporter class
dtronmans Nov 3, 2025
506b308
move groupby away from function, coco exporter with custom logic per …
dtronmans Nov 3, 2025
25d96af
Working COCO exporter for both formats, some cleaning required
dtronmans Nov 4, 2025
2d55ea7
working coco and native exporters, minus refactor
dtronmans Nov 4, 2025
4cfff78
Refactor of the transform() method: move row-by-row processing away
dtronmans Nov 4, 2025
f6aa64c
Remove support for COCO exporter for now
dtronmans Nov 4, 2025
e5b2e57
Cleaned luxonis_dataset.py only accept NativeExporter for now
dtronmans Nov 4, 2025
8856957
Remove COCO Exporter file from tracking for now
dtronmans Nov 4, 2025
db77a01
temp changes: bring back intermediate saving
dtronmans Nov 4, 2025
7994413
Proper intermediate saving
dtronmans Nov 4, 2025
8740a92
More changes for cleaner prepare_ldf
dtronmans Nov 4, 2025
d9599a4
Fixed precommit and types and passing tests
dtronmans Nov 4, 2025
d5ddde7
Docformat CI
dtronmans Nov 4, 2025
5dfbeb1
Pre-commit
dtronmans Nov 4, 2025
62b5d0f
Fix Native and COCO exporter cleanliness, start with all exporter types
dtronmans Nov 5, 2025
d894e7d
Pre-commit for exporter utils
dtronmans Nov 5, 2025
a8c5f03
darknetexporter supported
dtronmans Nov 5, 2025
a42c49f
darknet pre-commit and clean
dtronmans Nov 5, 2025
0eeba1b
move to exporterutils
dtronmans Nov 5, 2025
9eda46d
exporter_utils file added
dtronmans Nov 5, 2025
1b04bc9
Singular exporter for all YOLO formats
dtronmans Nov 5, 2025
c738811
Working instance segmentation for COCO export
dtronmans Nov 5, 2025
37605d9
More allowed export formats
dtronmans Nov 5, 2025
4518f09
Add classification and segmentation directory exporters
dtronmans Nov 5, 2025
51639f5
clean up of every exporter
dtronmans Nov 5, 2025
d516469
Fix and clean all exporters and add tests for parse -> export -> parse
dtronmans Nov 6, 2025
60628f9
Much cleaner and more modular structure for tests with TODO comments
dtronmans Nov 6, 2025
a8899e5
Fixes and cleaning, add VOC exporter too
dtronmans Nov 7, 2025
b719107
Exporters for createml and tensorflow csv + more todo added
dtronmans Nov 7, 2025
8739124
More changes and extensions to existing exporters and tests
dtronmans Nov 10, 2025
52cacf3
More cleaning up and consistency across exporters, keep higher-bound …
dtronmans Nov 10, 2025
6d3b4ec
Working tests, pre-commit and pyright
dtronmans Nov 10, 2025
d98b8b1
Merge branch 'main' into feat/all-exporters
dtronmans Nov 10, 2025
b144881
Working changes to yolov8 minus instance segmentation and keypoints, …
dtronmans Nov 10, 2025
21807bc
yolov8 instance segmentation and keypoints and more tests
dtronmans Nov 12, 2025
9cf51b1
remove int declarations if round is already used
dtronmans Nov 14, 2025
6aff924
rename ldf to dataset for consistency
dtronmans Nov 14, 2025
466a747
yolov8 boundingbox defaults to yolov8
dtronmans Nov 14, 2025
327 changes: 88 additions & 239 deletions luxonis_ml/data/datasets/luxonis_dataset.py
@@ -1,7 +1,6 @@
import json
import math
import shutil
import sys
from collections import defaultdict
from collections.abc import Iterable, Mapping, Sequence
from concurrent.futures import ThreadPoolExecutor
@@ -18,9 +17,29 @@
from semver.version import Version
from typing_extensions import Self, override

from luxonis_ml.data.exporters import (
BaseExporter,
ClassificationDirectoryExporter,
CocoExporter,
CreateMLExporter,
DarknetExporter,
NativeExporter,
PreparedLDF,
SegmentationMaskDirectoryExporter,
TensorflowCSVExporter,
VOCExporter,
YoloV4Exporter,
YoloV6Exporter,
YoloV8Exporter,
)
from luxonis_ml.data.exporters.exporter_utils import (
ExporterSpec,
ExporterUtils,
)
from luxonis_ml.data.utils import (
BucketStorage,
BucketType,
COCOFormat,
ParquetFileManager,
UpdateMode,
get_class_distributions,
@@ -1497,266 +1516,96 @@ def export(
partition) after export. Default is False.
@rtype: Union[Path, List[Path]]
@return: Path(s) to the ZIP file(s) containing the exported
dataset.
dataset (if zip_output=True). Otherwise, the output
directory.
"""

def _dump_annotations(
annotations: dict[str, list[Any]],
output_path: Path,
identifier: str,
part: int | None = None,
) -> None:
for split_name, annotation_data in annotations.items():
if part is not None:
split_path = (
output_path / f"{identifier}_part{part}" / split_name
)
else:
split_path = output_path / identifier / split_name
split_path.mkdir(parents=True, exist_ok=True)
with open(split_path / "annotations.json", "w") as f:
json.dump(annotation_data, f, indent=4)

def resolve_path(
img_path: str | Path, uuid: str, media_path: str
) -> str:
img_path = Path(img_path)
if img_path.exists():
return str(img_path)

ext = img_path.suffix.lstrip(".")
fallback = Path(media_path) / f"{uuid}.{ext}"
if not fallback.exists():
raise FileNotFoundError(f"Missing image: {fallback}")
return str(fallback)

if dataset_type is not DatasetType.NATIVE:
EXPORTER_MAP: dict[DatasetType, ExporterSpec] = {
DatasetType.NATIVE: ExporterSpec(NativeExporter, {}),
DatasetType.COCO: ExporterSpec(
CocoExporter,
{
"format": COCOFormat.ROBOFLOW,
"skeletons": getattr(self.metadata, "skeletons", None),
},
),
DatasetType.YOLOV8: ExporterSpec(
YoloV8Exporter,
{
"skeletons": getattr(self.metadata, "skeletons", None),
},
),
DatasetType.YOLOV6: ExporterSpec(YoloV6Exporter, {}),
DatasetType.YOLOV4: ExporterSpec(YoloV4Exporter, {}),
DatasetType.DARKNET: ExporterSpec(DarknetExporter, {}),
DatasetType.CLSDIR: ExporterSpec(
ClassificationDirectoryExporter, {}
),
DatasetType.SEGMASK: ExporterSpec(
SegmentationMaskDirectoryExporter, {}
),
DatasetType.VOC: ExporterSpec(VOCExporter, {}),
DatasetType.CREATEML: ExporterSpec(CreateMLExporter, {}),
DatasetType.TFCSV: ExporterSpec(TensorflowCSVExporter, {}),
}
spec = EXPORTER_MAP.get(dataset_type)
if spec is None:
raise NotImplementedError(
"Only 'NATIVE' dataset export is supported at the moment"
f"Unsupported export format: {dataset_type}"
)

logger.info(
f"Exporting '{self.identifier}' to '{dataset_type.name}' format"
)

splits = self.get_splits()
if splits is None:
raise ValueError("Cannot export dataset without splits")

output_path = Path(output_path)
if output_path.exists():
out_path = Path(output_path)
if out_path.exists():
raise ValueError(
f"Export path '{output_path}' already exists. Please remove it first."
)
output_path.mkdir(parents=True)
image_indices = {}
annotations = {"train": [], "val": [], "test": []}
df = self._load_df_offline(raise_when_empty=True)

# Capture the original order. Assume annotations are ordered if instance_id's were not specified.
df = df.with_row_count("row_idx").with_columns(
pl.col("row_idx").min().over("file").alias("first_occur")
)

# Resolve file paths to ensure they are absolute and exist
df = df.with_columns(
pl.struct(["file", "uuid"])
.map_elements(
lambda row: resolve_path(
row["file"], row["uuid"], str(self.media_path)
),
return_dtype=pl.Utf8,
)
.alias("file")
)

grouped_image_sources = df.select(
"group_id", "source_name", "file"
).unique()

# Filter out rows without annotations and ensure we have at least one row per group_id (images without annotations)
df = (
df.with_columns(
[
pl.col("annotation").is_not_null().alias("has_annotation"),
pl.col("group_id")
.cumcount()
.over("group_id")
.alias("first_occur"),
]
)
.pipe(
lambda df: (
df.filter(pl.col("has_annotation")).vstack(
df.filter(
~pl.col("group_id").is_in(
df.filter(pl.col("has_annotation"))
.select("group_id")
.unique()["group_id"]
)
).unique(subset=["group_id"], keep="first")
)
)
f"Export path '{out_path}' already exists. Please remove it first."
)
.sort(["row_idx"])
.select(
[
col
for col in df.columns
if col not in ["has_annotation", "row_idx", "first_occur"]
]
)
)
out_path.mkdir(parents=True)

splits = self.get_splits()
assert splits is not None
prepared_ldf = PreparedLDF.from_dataset(self)

current_size = 0
part = 0 if max_partition_size_gb else None
max_partition_size = (
max_partition_size_gb * 1024**3 if max_partition_size_gb else None
exporter: BaseExporter = spec.cls(
self.identifier, out_path, max_partition_size_gb, **spec.kwargs
)

# Group the full dataframe by group_id
df = df.group_by("group_id", maintain_order=True)
copied_files = set()

for group_id, group_df in df:
matched_df = grouped_image_sources.filter(
pl.col("group_id") == group_id
)
group_files = matched_df.get_column("file").to_list()
group_source_names = matched_df.get_column("source_name").to_list()

split = next(
(
s
for s, group_ids in splits.items()
if group_id in group_ids
),
None,
)
assert split is not None

group_total_size = sum(Path(f).stat().st_size for f in group_files)
annotation_records = []

for row in group_df.iter_rows(named=True):
task_name = row["task_name"]
class_name = row["class_name"]
instance_id = row["instance_id"]
task_type = row["task_type"]
ann_str = row["annotation"]

source_to_file = {
name: str(
(
Path("images")
/ f"{image_indices.setdefault(Path(f), len(image_indices))}{Path(f).suffix}"
).as_posix()
)
for name, f in zip(
group_source_names, group_files, strict=True
exporter.transform(prepared_ldf=prepared_ldf)

# Detect whether partitioned export was produced and the max part index
def _detect_last_part(base: Path, ds_id: str) -> int | None:
max_idx: int | None = None
prefix = f"{ds_id}_part"
for p in base.iterdir():
if p.is_dir() and p.name.startswith(prefix):
try:
idx = int(p.name[len(prefix) :])
except ValueError:
continue
max_idx = (
idx if (max_idx is None or idx > max_idx) else max_idx
)
}

record = {
"files" if len(group_source_names) > 1 else "file": (
source_to_file
if len(group_source_names) > 1
else source_to_file[group_source_names[0]]
),
"task_name": task_name,
}
return max_idx

if ann_str is not None:
data = json.loads(ann_str)
annotation_base = {
"instance_id": instance_id,
"class": class_name,
}
if task_type in {
"instance_segmentation",
"segmentation",
"boundingbox",
"keypoints",
}:
annotation_base[task_type] = data
elif task_type.startswith("metadata/"):
annotation_base["metadata"] = {task_type[9:]: data}
record["annotation"] = annotation_base

annotation_records.append(record)

annotations_size = sum(
sys.getsizeof(r) for r in annotation_records
)

if (
max_partition_size
and part is not None
and current_size + group_total_size + annotations_size
> max_partition_size
):
_dump_annotations(
annotations, output_path, self.identifier, part
)
current_size = 0
part += 1
annotations = {"train": [], "val": [], "test": []}

if max_partition_size:
data_path = (
output_path
/ f"{self.identifier}_part{part}"
/ split
/ "images"
)
else:
data_path = output_path / self.identifier / split / "images"
data_path.mkdir(parents=True, exist_ok=True)

for file in group_files:
file_path = Path(file)
if file_path not in copied_files:
copied_files.add(file_path)
image_index = image_indices[file_path]
dest_file = data_path / f"{image_index}{file_path.suffix}"
shutil.copy(file_path, dest_file)
current_size += file_path.stat().st_size

annotations[split].extend(annotation_records)
current_size += annotations_size

_dump_annotations(annotations, output_path, self.identifier, part)
last_part = _detect_last_part(out_path, self.identifier)

if zip_output:
archives = []
if max_partition_size:
assert part is not None
for i in range(part + 1):
folder = output_path / f"{self.identifier}_part{i}"
if folder.exists():
archive_file = shutil.make_archive(
str(folder), "zip", root_dir=folder
)
archives.append(Path(archive_file))
else:
folder = output_path / self.identifier
if folder.exists():
archive_file = shutil.make_archive(
str(folder), "zip", root_dir=folder
)
archives.append(Path(archive_file))
if len(archives) > 1:
archives = ExporterUtils.create_zip_output(
max_partition_size=max_partition_size_gb,
output_path=out_path,
part=last_part,
dataset_identifier=self.identifier,
)
if isinstance(archives, list):
logger.info(
f"Dataset successfully exported to: {[str(p) for p in archives]}"
)
return archives
logger.info(f"Dataset successfully exported to: {archives[0]}")
return archives[0]
logger.info(f"Dataset successfully exported to: {archives}")
return archives

logger.info(f"Dataset successfully exported to: {output_path}")
return output_path
logger.info(f"Dataset successfully exported to: {out_path}")
return out_path

def get_statistics(
self, sample_size: int | None = None, view: str | None = None
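
For context, a minimal usage sketch of the refactored export() dispatch shown in the diff above. The dataset identifier and output directory are placeholders, the import paths for LuxonisDataset and DatasetType are assumptions, and the keyword arguments mirror the parameters referenced in the hunk (output_path, dataset_type, zip_output, max_partition_size_gb); treat it as an illustration rather than the library's documented API.

    # Hedged usage sketch: import paths, dataset name, and output directory are
    # assumptions; keyword arguments mirror the parameters used in the diff above.
    from luxonis_ml.data import DatasetType, LuxonisDataset  # assumed import paths

    dataset = LuxonisDataset("my_dataset")  # hypothetical dataset identifier

    # Each DatasetType is dispatched to its exporter via EXPORTER_MAP
    # (e.g. DatasetType.COCO -> CocoExporter with COCOFormat.ROBOFLOW).
    result = dataset.export(
        output_path="exports/coco",    # must not already exist
        dataset_type=DatasetType.COCO,
        zip_output=True,               # return Path(s) to the ZIP archive(s)
        max_partition_size_gb=2,       # optional: split the output into parts
    )
    print(result)  # a single Path, or a list of Paths when partitioned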