
Commit 92d4d38

Fix the dataset merge, export on Machine B (#327)

1 parent 6a7ac25 commit 92d4d38

4 files changed (+105 −9 lines)

luxonis_ml/data/__main__.py

Lines changed: 12 additions & 1 deletion

@@ -830,6 +830,15 @@ def clone(
         ),
     ] = True,
     bucket_storage: BucketStorage = bucket_option,
+    team_id: Annotated[
+        str | None,
+        typer.Option(
+            "--team-id",
+            "-t",
+            help="Team ID to use for the new dataset. If not provided, the dataset's current team ID will be used.",
+            show_default=False,
+        ),
+    ] = None,
 ):
     """Clone an existing dataset with a new name.

@@ -847,7 +856,9 @@ def clone(

     print(f"Cloning dataset '{name}' to '{new_name}'...")
     dataset = LuxonisDataset(name, bucket_storage=bucket_storage)
-    dataset.clone(new_dataset_name=new_name, push_to_cloud=push_to_cloud)
+    dataset.clone(
+        new_dataset_name=new_name, push_to_cloud=push_to_cloud, team_id=team_id
+    )
     print(f"[green]Dataset '{name}' successfully cloned to '{new_name}'.")

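The new `--team-id`/`-t` option simply threads a team identifier through to `LuxonisDataset.clone`. A minimal usage sketch, assuming the typer app in this file is exposed as the `luxonis_ml data` CLI; the dataset and team names below are made up:

# CLI form (hypothetical names):
#   luxonis_ml data clone parking_lot parking_lot_copy --team-id acme-team
# Programmatic equivalent via the method the command delegates to:
from luxonis_ml.data import LuxonisDataset

dataset = LuxonisDataset("parking_lot")
dataset.clone(
    new_dataset_name="parking_lot_copy",
    push_to_cloud=True,
    team_id="acme-team",  # omitted -> falls back to the source dataset's team
)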
luxonis_ml/data/datasets/luxonis_dataset.py

Lines changed: 31 additions & 6 deletions

@@ -1,6 +1,7 @@
 import json
 import math
 import shutil
+import sys
 from collections import defaultdict
 from collections.abc import Iterable, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
@@ -269,7 +270,10 @@ def _merge_metadata_with(self, other: "LuxonisDataset") -> None:
         self._write_metadata()

     def clone(
-        self, new_dataset_name: str, push_to_cloud: bool = True
+        self,
+        new_dataset_name: str,
+        push_to_cloud: bool = True,
+        team_id: str | None = None,
     ) -> "LuxonisDataset":
         """Create a new LuxonisDataset that is a local copy of the
         current dataset. Cloned dataset will overwrite the existing
@@ -281,10 +285,12 @@ def clone(
         @param push_to_cloud: Whether to push the new dataset to the
             cloud. Only if the current dataset is remote.
         """
+        if team_id is None:
+            team_id = self.team_id

         new_dataset = LuxonisDataset(
             dataset_name=new_dataset_name,
-            team_id=self.team_id,
+            team_id=team_id,
             bucket_type=self.bucket_type,
             bucket_storage=self.bucket_storage,
             delete_local=True,
@@ -391,6 +397,18 @@ def merge_with(
             update_mode=UpdateMode.MISSING,
         )

+        for entry in (
+            df_other.select(["uuid", "file"])
+            .unique(subset=["uuid"])
+            .to_dicts()
+        ):
+            uid, rel_file = entry["uuid"], entry["file"]
+            src_path = other.media_path / f"{uid}{Path(rel_file).suffix}"
+            dst_path = target_dataset.media_path / src_path.name
+            if src_path.exists() and not dst_path.exists():
+                dst_path.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy(src_path, dst_path)
+
         target_dataset._merge_metadata_with(other)

         return target_dataset
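This loop closes the "Machine B" gap: media pulled from the cloud is stored under `media_path` as `<uuid><original suffix>`, and before this change `merge_with` never copied those files into the target dataset. A self-contained sketch of the same copy step, assuming a polars DataFrame with `uuid` and `file` columns; the function name and paths are illustrative:

# Minimal sketch of the media-copy step, outside the class.
import shutil
from pathlib import Path

import polars as pl

def copy_media(df_other: pl.DataFrame, src_media: Path, dst_media: Path) -> None:
    for entry in df_other.select(["uuid", "file"]).unique(subset=["uuid"]).to_dicts():
        # Media is stored as "<uuid><original suffix>", e.g. "ab12cd34.jpg".
        src = src_media / f"{entry['uuid']}{Path(entry['file']).suffix}"
        dst = dst_media / src.name
        if src.exists() and not dst.exists():
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(src, dst)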
@@ -1422,12 +1440,11 @@ def _dump_annotations(
             description="Exporting ...",
         ):
             uuid = row[7]
-            if self.is_remote:
+            file = Path(row[-1])
+            if self.is_remote or not file.exists():
                 file_extension = row[0].rsplit(".", 1)[-1]
                 file = self.media_path / f"{uuid}.{file_extension}"
                 assert file.exists()
-            else:
-                file = Path(row[-1])

             split = None
             for s, uuids in splits.items():
@@ -1445,9 +1462,13 @@ def _dump_annotations(

             if file not in image_indices:
                 file_size = file.stat().st_size
+                annotations_size = sum(
+                    sys.getsizeof(lst) for lst in annotations.values()
+                )
                 if (
                     max_partition_size
-                    and current_size + file_size > max_partition_size
+                    and current_size + file_size + annotations_size
+                    > max_partition_size
                 ):
                     _dump_annotations(
                         annotations, output_path, self.identifier, part
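The partition check now budgets the in-memory annotation buffer alongside the image bytes, so a partition is flushed before it overshoots `max_partition_size`. Worth noting that `sys.getsizeof` on a list is shallow, so the estimate is a coarse lower bound; a small illustration with made-up records:

import sys

records = [{"file": f"img_{i}.jpg"} for i in range(1000)]
print(sys.getsizeof(records))                   # size of the list object only
print(sum(sys.getsizeof(r) for r in records))   # the contained dicts are far larger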
@@ -1511,6 +1532,10 @@ def _dump_annotations(
                 record["annotation"][task_type] = data
                 annotations[split].append(record)

+            elif task_type == "metadata/text":
+                record["annotation"]["metadata"] = {"text": data}
+                annotations[split].append(record)
+
         _dump_annotations(annotations, output_path, self.identifier, part)

         if zip_output:
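The new `metadata/text` branch keeps free-text metadata from being silently dropped during export. A sketch of the record shape it produces; the surrounding fields are inferred from this diff, and the values are made up:

record = {
    "file": "ab12cd34.jpg",  # hypothetical; other record fields omitted
    "annotation": {
        "class": "person",
        "metadata": {"text": "captured at night"},  # written by the new branch
    },
}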

tests/conftest.py

Lines changed: 4 additions & 2 deletions

@@ -139,7 +139,7 @@ def base_tempdir(worker_id: str):


 @pytest.fixture
-def tempdir(base_tempdir: Path, randint: int) -> Path:
+def tempdir(base_tempdir: Path, randint: int) -> Generator[Path, None, None]:
     t = time.time()
     unique_id = randint
     while True:
@@ -155,7 +155,9 @@ def tempdir(base_tempdir: Path, randint: int) -> Path:

     path.mkdir(exist_ok=True)

-    return path
+    yield path
+
+    shutil.rmtree(path, ignore_errors=True)


 @pytest.fixture(scope="session")
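Switching the fixture from `return` to `yield` means the code after the `yield` runs at teardown, so each test's directory is removed even when the test fails. The general pattern, as a minimal standalone sketch (`workdir` is a made-up fixture; `tmp_path` is pytest's built-in):

import shutil
from collections.abc import Generator
from pathlib import Path

import pytest

@pytest.fixture
def workdir(tmp_path: Path) -> Generator[Path, None, None]:
    path = tmp_path / "work"
    path.mkdir(exist_ok=True)
    yield path                               # the test runs here
    shutil.rmtree(path, ignore_errors=True)  # teardown, even on test failure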

tests/test_data/test_dataset.py

Lines changed: 58 additions & 0 deletions

@@ -929,3 +929,61 @@ def generator(start: int, end: int) -> DatasetIterator:
     loader = LuxonisLoader(cloud_dataset)
     assert sum(1 for _ in loader) == 3
     assert cloud_dataset.get_statistics() == original_stats
+
+
+@pytest.mark.dependency(name="test_dataset[BucketStorage.LOCAL]")
+def test_merge_on_different_machines(dataset_name: str, tempdir: Path):
+    def generator(start: int, end: int) -> DatasetIterator:
+        """Generate sample dataset items with bounding boxes."""
+        for i in range(start, end):
+            img = create_image(i, tempdir)
+            yield {
+                "file": img,
+                "annotation": {
+                    "class": "person",
+                    "boundingbox": {"x": 0.1, "y": 0.1, "w": 0.1, "h": 0.1},
+                    "instance_id": i,
+                },
+            }
+
+    dataset1 = create_dataset(
+        dataset_name + "_1",
+        generator(0, 3),
+        bucket_storage=BucketStorage.GCS,
+        delete_local=True,
+        delete_remote=True,
+        splits=(1, 0, 0),
+    )
+    dataset2 = create_dataset(
+        dataset_name + "_2",
+        generator(3, 6),
+        bucket_storage=BucketStorage.GCS,
+        delete_local=True,
+        delete_remote=True,
+        splits=(1, 0, 0),
+    )
+    shutil.rmtree(tempdir)
+    dataset1.pull_from_cloud()
+    dataset2.pull_from_cloud()
+    dataset1.delete_dataset(delete_remote=True)
+    dataset2.delete_dataset(delete_remote=True)
+    dataset1 = LuxonisDataset(dataset_name + "_1")
+    dataset2 = LuxonisDataset(dataset_name + "_2")
+    assert len(list(dataset1.media_path.glob("*"))) == 3
+    assert len(list(dataset2.media_path.glob("*"))) == 3
+    dataset3 = dataset1.merge_with(
+        dataset2, inplace=False, new_dataset_name=dataset_name
+    )
+    loader = LuxonisLoader(dataset3)
+    assert sum(1 for _ in loader) == 6
+    dataset3.export(tempdir)
+    assert (
+        len(
+            list(
+                Path.cwd().glob(
+                    f"{tempdir}/{dataset3.dataset_name}/train/images/*"
+                )
+            )
+        )
+        == 6
+    )
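End to end, the test recreates the two-machine scenario from the commit title: build and push two datasets, wipe the local state, pull them back as if on another machine, merge, and export. A condensed sketch of that flow with made-up names (a configured GCS bucket is assumed):

from luxonis_ml.data import LuxonisDataset

a = LuxonisDataset("site_a")  # local copies rebuilt from pulled media
b = LuxonisDataset("site_b")
merged = a.merge_with(b, inplace=False, new_dataset_name="site_merged")
merged.export("exported/")    # images land under <name>/train/images/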
