from typing import Dict, List, Optional, Set, Tuple

import numpy as np
-import pandas as pd
-import pyarrow as pa
+import polars as pl
import pyarrow.parquet as pq
import rich.progress

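Reviewer note (not part of the diff): pandas and the top-level pyarrow import are dropped; polars takes over the in-memory work, while pyarrow.parquet stays only for the final index write. A minimal round-trip sketch of that split, using a hypothetical file name:

```python
import polars as pl
import pyarrow.parquet as pq

# polars for in-memory frames, pyarrow.parquet only for the write, as in the new code.
df = pl.DataFrame({"uuid": ["u1", "u2"], "file": ["a.jpg", "b.jpg"]})
pq.write_table(df.to_arrow(), "example_index.parquet")  # hypothetical path
assert pl.read_parquet("example_index.parquet").shape == (2, 2)
```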
@@ -138,10 +137,7 @@ def __len__(self) -> int:
        """Returns the number of instances in the dataset."""

        df = self._load_df_offline(self.bucket_storage != BucketStorage.LOCAL)
-        if df is not None:
-            return len(set(df["uuid"]))
-        else:
-            return 0
+        return len(df.select("uuid").unique()) if df is not None else 0

    def _write_datasets(self) -> None:
        with open(self.datasets_cache_file, "w") as file:
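Reviewer note (not part of the diff): the polars one-liner counts distinct UUIDs by deduplicating the single-column frame. A small sketch with placeholder data; `Series.n_unique()` would be an equivalent, slightly more direct spelling:

```python
import polars as pl

df = pl.DataFrame({"uuid": ["u1", "u1", "u2"]})
assert len(df.select("uuid").unique()) == 2  # form used in the diff
assert df["uuid"].n_unique() == 2            # equivalent polars shorthand
```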
@@ -182,7 +178,7 @@ def _init_path(self) -> None:
                f"{self.team_id}/datasets/{self.dataset_name}"
            )

-    def _load_df_offline(self, sync_mode: bool = False) -> Optional[pd.DataFrame]:
+    def _load_df_offline(self, sync_mode: bool = False) -> Optional[pl.DataFrame]:
        dfs = []
        if self.bucket_storage == BucketStorage.LOCAL or sync_mode:
            annotations_path = self.annotations_path
@@ -192,32 +188,32 @@ def _load_df_offline(self, sync_mode: bool = False) -> Optional[pd.DataFrame]:
            return None
        for file in annotations_path.iterdir():
            if file.suffix == ".parquet":
-                dfs.append(pd.read_parquet(file))
-        if len(dfs):
-            return pd.concat(dfs)
+                dfs.append(pl.read_parquet(file))
+        if dfs:
+            return pl.concat(dfs)
        else:
            return None

    def _find_filepath_uuid(
        self,
        filepath: Path,
-        index: Optional[pd.DataFrame],
+        index: Optional[pl.DataFrame],
        *,
        raise_on_missing: bool = False,
    ) -> Optional[str]:
        if index is None:
            return None

        abs_path = str(filepath.absolute())
-        if abs_path in list(index["original_filepath"]):
-            matched = index[index["original_filepath"] == abs_path]
-            if len(matched):
-                return list(matched["uuid"])[0]
+        matched = index.filter(pl.col("original_filepath") == abs_path)
+
+        if len(matched):
+            return list(matched.select("uuid"))[0][0]
        elif raise_on_missing:
            raise ValueError(f"File {abs_path} not found in index")
        return None

-    def _get_file_index(self) -> Optional[pd.DataFrame]:
+    def _get_file_index(self) -> Optional[pl.DataFrame]:
        index = None
        if self.bucket_storage == BucketStorage.LOCAL:
            file_index_path = self.metadata_path / "file_index.parquet"
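Reviewer note (not part of the diff): `index.filter(pl.col(...) == value)` replaces the pandas boolean-mask lookup, and the membership pre-check becomes unnecessary because an empty filter result simply has length zero. A sketch with placeholder data; plain column indexing is an equivalent way to pull out the first matched UUID:

```python
import polars as pl

index = pl.DataFrame(
    {"uuid": ["u1", "u2"], "original_filepath": ["/data/a.jpg", "/data/b.jpg"]}
)
matched = index.filter(pl.col("original_filepath") == "/data/b.jpg")
if len(matched):
    # Equivalent to list(matched.select("uuid"))[0][0] used in the diff.
    assert matched["uuid"][0] == "u2"
```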
@@ -228,24 +224,25 @@ def _get_file_index(self) -> Optional[pd.DataFrame]:
            except Exception:
                pass
        if file_index_path.exists():
-            index = pd.read_parquet(file_index_path)
+            index = pl.read_parquet(file_index_path).select(
+                pl.all().exclude("^__index_level_.*$")
+            )
        return index

    def _write_index(
        self,
-        index: Optional[pd.DataFrame],
-        new_index: Dict,
+        index: Optional[pl.DataFrame],
+        new_index: Dict[str, List[str]],
        override_path: Optional[str] = None,
    ) -> None:
        if override_path:
            file_index_path = override_path
        else:
            file_index_path = self.metadata_path / "file_index.parquet"
-        df = pd.DataFrame(new_index)
+        df = pl.DataFrame(new_index)
        if index is not None:
-            df = pd.concat([index, df])
-        table = pa.Table.from_pandas(df)
-        pq.write_table(table, file_index_path)
+            df = pl.concat([index, df])
+        pq.write_table(df.to_arrow(), file_index_path)

    @contextmanager
    def _log_time(self):
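Reviewer note (not part of the diff): the `^__index_level_.*$` exclusion strips the implicit index column that pandas wrote into existing parquet files, which polars would otherwise surface as an ordinary column. On the write side, the pyarrow round-trip could also be shortened with polars' own writer, assuming no pyarrow-specific options are needed:

```python
import polars as pl

df = pl.DataFrame(
    {"uuid": ["u1"], "file": ["a.jpg"], "original_filepath": ["/data/a.jpg"]}
)

# Equivalent to pq.write_table(df.to_arrow(), path), assuming default parquet settings suffice.
df.write_parquet("file_index.parquet")

# Reading back while dropping any pandas-era index columns, as the diff does.
index = pl.read_parquet("file_index.parquet").select(
    pl.all().exclude("^__index_level_.*$")
)
```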
@@ -358,130 +355,141 @@ def delete_dataset(self) -> None:
        if self.bucket_storage == BucketStorage.LOCAL:
            shutil.rmtree(self.path)

-    def add(
-        self,
-        generator: DatasetIterator,
-        batch_size: int = 1_000_000,
-    ) -> None:
-        def _process_arrays(batch_data: List[DatasetRecord]) -> None:
-            array_paths = set(
-                ann.path for ann in batch_data if isinstance(ann, ArrayAnnotation)
+    def _process_arrays(self, batch_data: List[DatasetRecord]) -> None:
+        array_paths = set(
+            ann.path for ann in batch_data if isinstance(ann, ArrayAnnotation)
+        )
+        if array_paths:
+            task = self.progress.add_task(
+                "[magenta]Processing arrays...", total=len(batch_data)
            )
-            if array_paths:
-                task = self.progress.add_task(
-                    "[magenta]Processing arrays...", total=len(batch_data)
-                )
-                self.logger.info("Checking arrays...")
-                with self._log_time():
-                    data_utils.check_arrays(array_paths)
-                self.logger.info("Generating array UUIDs...")
-                with self._log_time():
-                    array_uuid_dict = self.fs.get_file_uuids(
-                        array_paths, local=True
-                    )  # TODO: support from bucket
-                if self.bucket_storage != BucketStorage.LOCAL:
-                    self.logger.info("Uploading arrays...")
-                    # TODO: support from bucket (likely with a self.fs.copy_dir)
-                    with self._log_time():
-                        arrays_upload_dict = self.fs.put_dir(
-                            local_paths=array_paths,
-                            remote_dir="arrays",
-                            uuid_dict=array_uuid_dict,
-                        )
-                self.logger.info("Finalizing paths...")
-                self.progress.start()
-                for ann in batch_data:
-                    if isinstance(ann, ArrayAnnotation):
-                        if self.bucket_storage != BucketStorage.LOCAL:
-                            remote_path = arrays_upload_dict[str(ann.path)]  # type: ignore
-                            remote_path = (
-                                f"{self.fs.protocol}://{self.fs.path / remote_path}"
-                            )
-                            ann.path = remote_path  # type: ignore
-                        else:
-                            ann.path = ann.path.absolute()
-                    self.progress.update(task, advance=1)
-                self.progress.stop()
-                self.progress.remove_task(task)
-
-        def _add_process_batch(batch_data: List[DatasetRecord]) -> None:
-            paths = list(set(data.file for data in batch_data))
-            self.logger.info("Generating UUIDs...")
+            self.logger.info("Checking arrays...")
+            with self._log_time():
+                data_utils.check_arrays(array_paths)
+            self.logger.info("Generating array UUIDs...")
            with self._log_time():
-                uuid_dict = self.fs.get_file_uuids(
-                    paths, local=True
+                array_uuid_dict = self.fs.get_file_uuids(
+                    array_paths, local=True
                )  # TODO: support from bucket
            if self.bucket_storage != BucketStorage.LOCAL:
-                self.logger.info("Uploading media...")
+                self.logger.info("Uploading arrays...")
                # TODO: support from bucket (likely with a self.fs.copy_dir)
-
                with self._log_time():
-                    self.fs.put_dir(
-                        local_paths=paths, remote_dir="media", uuid_dict=uuid_dict
+                    arrays_upload_dict = self.fs.put_dir(
+                        local_paths=array_paths,
+                        remote_dir="arrays",
+                        uuid_dict=array_uuid_dict,
                    )
+            self.logger.info("Finalizing paths...")
+            self.progress.start()
+            for ann in batch_data:
+                if isinstance(ann, ArrayAnnotation):
+                    if self.bucket_storage != BucketStorage.LOCAL:
+                        remote_path = arrays_upload_dict[str(ann.path)]  # type: ignore
+                        remote_path = (
+                            f"{self.fs.protocol}://{self.fs.path / remote_path}"
+                        )
+                        ann.path = remote_path  # type: ignore
+                    else:
+                        ann.path = ann.path.absolute()
+                    self.progress.update(task, advance=1)
+            self.progress.stop()
+            self.progress.remove_task(task)

-            task = self.progress.add_task(
-                "[magenta]Processing data...", total=len(batch_data)
-            )
-
-            _process_arrays(batch_data)
+    def _add_process_batch(
+        self,
+        batch_data: List[DatasetRecord],
+        pfm: ParquetFileManager,
+        index: Optional[pl.DataFrame],
+        new_index: Dict[str, List[str]],
+        processed_uuids: Set[str],
+    ) -> None:
+        paths = list(set(data.file for data in batch_data))
+        self.logger.info("Generating UUIDs...")
+        with self._log_time():
+            uuid_dict = self.fs.get_file_uuids(
+                paths, local=True
+            )  # TODO: support from bucket
+        if self.bucket_storage != BucketStorage.LOCAL:
+            self.logger.info("Uploading media...")
+            # TODO: support from bucket (likely with a self.fs.copy_dir)

-            self.logger.info("Saving annotations...")
            with self._log_time():
-                self.progress.start()
-                for ann in batch_data:
-                    filepath = ann.file
-                    file = filepath.name
-                    uuid = uuid_dict[str(filepath)]
-                    matched_id = self._find_filepath_uuid(filepath, index)
-                    if matched_id is not None:
-                        if matched_id != uuid:
-                            # TODO: not sure if this should be an exception or how we should really handle it
-                            raise Exception(
-                                f"{filepath} already added to the dataset! Please skip or rename the file."
-                            )
-                        # TODO: we may also want to check for duplicate uuids to get a one-to-one relationship
-                    elif uuid not in new_index["uuid"]:
-                        new_index["uuid"].append(uuid)
-                        new_index["file"].append(file)
-                        new_index["original_filepath"].append(str(filepath.absolute()))
-
-                    self.pfm.write({"uuid": uuid, **ann.to_parquet_dict()})
-                    self.progress.update(task, advance=1)
-                self.progress.stop()
-                self.progress.remove_task(task)
+                self.fs.put_dir(
+                    local_paths=paths, remote_dir="media", uuid_dict=uuid_dict
+                )

+        task = self.progress.add_task(
+            "[magenta]Processing data...", total=len(batch_data)
+        )
+
+        self._process_arrays(batch_data)
+
+        self.logger.info("Saving annotations...")
+        with self._log_time():
+            self.progress.start()
+            for ann in batch_data:
+                filepath = ann.file
+                file = filepath.name
+                uuid = uuid_dict[str(filepath)]
+                matched_id = self._find_filepath_uuid(filepath, index)
+                if matched_id is not None:
+                    if matched_id != uuid:
+                        # TODO: not sure if this should be an exception or how we should really handle it
+                        raise Exception(
+                            f"{filepath} already added to the dataset! Please skip or rename the file."
+                        )
+                    # TODO: we may also want to check for duplicate uuids to get a one-to-one relationship
+                elif uuid not in processed_uuids:
+                    new_index["uuid"].append(uuid)
+                    new_index["file"].append(file)
+                    new_index["original_filepath"].append(str(filepath.absolute()))
+                    processed_uuids.add(uuid)
+
+                pfm.write({"uuid": uuid, **ann.to_parquet_dict()})
+                self.progress.update(task, advance=1)
+            self.progress.stop()
+            self.progress.remove_task(task)
+
+    def add(self, generator: DatasetIterator, batch_size: int = 1_000_000) -> None:
        if self.bucket_storage == BucketStorage.LOCAL:
-            self.pfm = ParquetFileManager(str(self.annotations_path))
+            annotations_dir = self.annotations_path
        else:
            self._make_temp_dir()
            annotations_dir = self.tmp_dir / "annotations"
            annotations_dir.mkdir(exist_ok=True, parents=True)
-            self.pfm = ParquetFileManager(str(annotations_dir))

        index = self._get_file_index()
        new_index = {"uuid": [], "file": [], "original_filepath": []}
+        processed_uuids = set()

        batch_data: list[DatasetRecord] = []

        classes_per_task: Dict[str, Set[str]] = defaultdict(set)
        num_kpts_per_task: Dict[str, int] = {}

-        for i, data in enumerate(generator, start=1):
-            record = data if isinstance(data, DatasetRecord) else DatasetRecord(**data)
-            if record.annotation is not None:
-                classes_per_task[record.annotation.task].add(record.annotation.class_)
-                if record.annotation.type_ == "keypoints":
-                    num_kpts_per_task[record.annotation.task] = len(
-                        record.annotation.keypoints
+        with ParquetFileManager(annotations_dir) as pfm:
+            for i, data in enumerate(generator, start=1):
+                record = (
+                    data if isinstance(data, DatasetRecord) else DatasetRecord(**data)
+                )
+                if record.annotation is not None:
+                    classes_per_task[record.annotation.task].add(
+                        record.annotation.class_
                    )
+                    if record.annotation.type_ == "keypoints":
+                        num_kpts_per_task[record.annotation.task] = len(
+                            record.annotation.keypoints
+                        )

-            batch_data.append(record)
-            if i % batch_size == 0:
-                _add_process_batch(batch_data)
-                batch_data = []
+                batch_data.append(record)
+                if i % batch_size == 0:
+                    self._add_process_batch(
+                        batch_data, pfm, index, new_index, processed_uuids
+                    )
+                    batch_data = []

-        _add_process_batch(batch_data)
+            self._add_process_batch(batch_data, pfm, index, new_index, processed_uuids)

        _, curr_classes = self.get_classes()
        for task, classes in classes_per_task.items():
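Reviewer note (not part of the diff): besides promoting the nested closures to methods, this hunk opens the ParquetFileManager as a context manager (so the explicit `self.pfm.close()` below goes away) and tracks already-seen UUIDs in a `processed_uuids` set instead of scanning the `new_index["uuid"]` list on every record. A minimal sketch of that de-duplication pattern with hypothetical data:

```python
from typing import Dict, List, Set

new_index: Dict[str, List[str]] = {"uuid": [], "file": [], "original_filepath": []}
processed_uuids: Set[str] = set()

# Hypothetical (uuid, filepath) pairs standing in for one batch of records.
for uuid, filepath in [("u1", "/data/a.jpg"), ("u1", "/data/a.jpg"), ("u2", "/data/b.jpg")]:
    if uuid not in processed_uuids:  # O(1) set lookup instead of a list scan
        new_index["uuid"].append(uuid)
        new_index["file"].append(filepath.rsplit("/", 1)[-1])
        new_index["original_filepath"].append(filepath)
        processed_uuids.add(uuid)

assert new_index["uuid"] == ["u1", "u2"]
```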
@@ -490,8 +498,6 @@ def _add_process_batch(batch_data: List[DatasetRecord]) -> None:
            self.logger.info(f"Detected new classes for task {task}: {new_classes}")
            self.set_classes(list(classes | old_classes), task, _remove_tmp_dir=False)

-        self.pfm.close()
-
        if self.bucket_storage == BucketStorage.LOCAL:
            self._write_index(index, new_index)
        else:
@@ -516,7 +522,7 @@ def make_splits(

        df = self._load_df_offline()
        assert df is not None
-        ids = list(set(df["uuid"]))
+        ids = df.select("uuid").unique().get_column("uuid").to_list()
        np.random.shuffle(ids)
        N = len(ids)
        b1 = round(N * ratios[0])
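Reviewer note (not part of the diff): in the splits path the distinct UUIDs are materialized as a Python list so `np.random.shuffle` can permute them in place. A sketch with placeholder data; `df["uuid"].unique().to_list()` would be an equivalent shorthand:

```python
import numpy as np
import polars as pl

df = pl.DataFrame({"uuid": ["u1", "u2", "u2", "u3"]})
ids = df.select("uuid").unique().get_column("uuid").to_list()  # form used in the diff
np.random.shuffle(ids)
assert sorted(ids) == ["u1", "u2", "u3"]
```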