
Commit b1a2026

Fix multiprocessing bug with num_workers>0 (#359)
Previously we added caching to support num_workers > 0, since native `Labels` objects (HDF5 handles via h5py) aren't picklable under the spawn start method used on macOS/Windows. However, the `Labels` object was still attached to the dataset and was sent to worker processes, breaking multiprocessing. This PR removes `sio.Labels` from the dataset state when caching is enabled (it is kept only when caching is disabled), so workers no longer receive a non-picklable handle. With caching on, users can now safely set num_workers > 0 and get faster training on macOS/Windows without HDF5 pickling errors.
1 parent 6ed617b commit b1a2026

15 files changed: +351 -162 lines
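
The actual change lives in sleap_nn/data/custom_datasets.py, whose diff is not rendered below. As a minimal, self-contained sketch of the pattern described in the commit message (class and attribute names such as CachedPoseDataset, samples, and use_cache are illustrative assumptions, not the real sleap-nn code): when caching is enabled, samples are materialized up front and the HDF5-backed labels handle is dropped from the dataset state, so the dataset pickles cleanly into spawn-based DataLoader workers.

# Illustrative sketch only; see sleap_nn/data/custom_datasets.py for the real classes.
import torch
from torch.utils.data import DataLoader, Dataset


class CachedPoseDataset(Dataset):
    """Toy dataset showing why dropping the labels handle matters."""

    def __init__(self, labels, use_cache: bool = True):
        self.use_cache = use_cache
        if use_cache:
            # Read everything needed while the (non-picklable) handle is still open...
            self.samples = [torch.as_tensor(x, dtype=torch.float32) for x in labels]
            # ...then drop the handle so spawned workers never receive it.
            self.labels = None
        else:
            # Without caching, the handle stays attached; num_workers must stay at 0.
            self.labels = labels
            self.samples = None

    def __len__(self):
        return len(self.samples) if self.use_cache else len(self.labels)

    def __getitem__(self, idx):
        if self.use_cache:
            return self.samples[idx]
        return torch.as_tensor(self.labels[idx], dtype=torch.float32)


if __name__ == "__main__":
    # With caching on, the dataset holds no open file handle, so spawn-based
    # workers (the default start method on macOS/Windows) can unpickle it.
    ds = CachedPoseDataset(labels=[[0.0, 1.0], [2.0, 3.0]], use_cache=True)
    loader = DataLoader(ds, batch_size=2, num_workers=2)
    for batch in loader:
        print(batch.shape)
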

sleap_nn/data/custom_datasets.py

Lines changed: 158 additions & 122 deletions
Large diffs are not rendered by default.

sleap_nn/data/providers.py

Lines changed: 13 additions & 8 deletions
@@ -1,6 +1,6 @@
 """This module implements pipeline blocks for reading input data such as labels."""

-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import numpy as np
 import sleap_io as sio
@@ -36,15 +36,19 @@ def get_max_height_width(labels: sio.Labels) -> Tuple[int, int]:


 def process_lf(
-    lf: sio.LabeledFrame,
+    instances_list: List[sio.Instance],
+    img: np.ndarray,
+    frame_idx: int,
     video_idx: int,
     max_instances: int,
     user_instances_only: bool = True,
 ) -> Dict[str, Any]:
     """Get sample dict from `sio.LabeledFrame`.

     Args:
-        lf: Input `sio.LabeledFrame`.
+        instances_list: List of `sio.Instance` objects.
+        img: Input image.
+        frame_idx: Frame index of the given lf.
         video_idx: Video index of the given lf.
         max_instances: Maximum number of instances that could occur in a single LabeledFrame.
         user_instances_only: True if filter labels only to user instances else False.
@@ -57,13 +61,14 @@ def process_lf(
     """
     # Filter to user instances
     if user_instances_only:
-        if lf.user_instances is not None and len(lf.user_instances) > 0:
-            lf.instances = lf.user_instances
+        user_instances = [inst for inst in instances_list if type(inst) is sio.Instance]
+        if len(user_instances) > 0:
+            instances_list = user_instances

-    image = np.transpose(lf.image, (2, 0, 1))  # HWC -> CHW
+    image = np.transpose(img, (2, 0, 1))  # HWC -> CHW

     instances = []
-    for inst in lf:
+    for inst in instances_list:
         if not inst.is_empty:
             instances.append(inst.numpy())
     instances = np.stack(instances, axis=0)
@@ -92,7 +97,7 @@ def process_lf(
         "image": torch.from_numpy(image.copy()),
         "instances": instances,
         "video_idx": torch.tensor(video_idx, dtype=torch.int32),
-        "frame_idx": torch.tensor(lf.frame_idx, dtype=torch.int32),
+        "frame_idx": torch.tensor(frame_idx, dtype=torch.int32),
         "orig_size": torch.Tensor([img_height, img_width]).unsqueeze(0),
         "num_instances": num_instances,
     }

tests/data/test_augmentation.py

Lines changed: 14 additions & 2 deletions
@@ -13,7 +13,13 @@ def test_apply_intensity_augmentation(minimal_instance):
     """Test `apply_intensity_augmentation` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )
     ex["image"] = apply_normalization(ex["image"])

     img, pts = apply_intensity_augmentation(
@@ -36,7 +42,13 @@ def test_apply_geometric_augmentation(minimal_instance):
     """Test `apply_geometric_augmentation` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )
     ex["image"] = apply_normalization(ex["image"])

     img, pts = apply_geometric_augmentation(

tests/data/test_confmaps.py

Lines changed: 14 additions & 2 deletions
@@ -15,7 +15,13 @@ def test_generate_confmaps(minimal_instance):
     """Test `generate_confmaps` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     confmaps = generate_confmaps(
         ex["instances"][:, 0].unsqueeze(dim=1), img_hw=(384, 384)
@@ -27,7 +33,13 @@ def test_generate_multiconfmaps(minimal_instance):
     """Test `generate_multiconfmaps` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     confmaps = generate_multiconfmaps(
         ex["instances"], img_hw=(384, 384), num_instances=ex["num_instances"]

tests/data/test_custom_datasets.py

Lines changed: 18 additions & 12 deletions
@@ -88,7 +88,7 @@ def test_bottomup_dataset(minimal_instance, tmp_path):
         cache_img="memory",
         apply_aug=base_bottom_config.use_augmentations_train,
     )
-    dataset._fill_cache()
+    dataset._fill_cache([sio.load_slp(minimal_instance)])

     gt_sample_keys = [
         "image",
@@ -201,7 +201,7 @@ def test_bottomup_dataset(minimal_instance, tmp_path):
         cache_img="disk",
         cache_img_path=f"{tmp_path}/cache_imgs",
     )
-    dataset._fill_cache()
+    dataset._fill_cache([sio.load_slp(minimal_instance)])

     gt_sample_keys = [
         "image",
@@ -314,7 +314,7 @@ def test_bottomup_multiclass_dataset(minimal_instance, tmp_path):
         cache_img="memory",
         apply_aug=base_bottom_config.use_augmentations_train,
     )
-    dataset._fill_cache()
+    dataset._fill_cache([tracked_labels])

     sample = next(iter(dataset))
     assert len(sample.keys()) == len(gt_sample_keys)
@@ -400,7 +400,7 @@ def test_bottomup_multiclass_dataset(minimal_instance, tmp_path):
         cache_img="disk",
         cache_img_path=f"{tmp_path}/cache_imgs",
     )
-    dataset._fill_cache()
+    dataset._fill_cache([tracked_labels])

     sample = next(iter(dataset))
     assert len(sample.keys()) == len(gt_sample_keys)
@@ -446,7 +446,13 @@ def test_centered_instance_dataset(minimal_instance, tmp_path):
         cache_img="disk",
         cache_img_path=f"{tmp_path}/cache_imgs",
     )
-    dataset._fill_cache()
+    dataset._fill_cache(
+        [
+            sio.load_slp(minimal_instance),
+            sio.load_slp(minimal_instance),
+            sio.load_slp(minimal_instance),
+        ]
+    )

     gt_sample_keys = [
         "centroid",
@@ -481,7 +487,7 @@ def test_centered_instance_dataset(minimal_instance, tmp_path):
         cache_img="memory",
         apply_aug=base_topdown_data_config.use_augmentations_train,
     )
-    dataset._fill_cache()
+    dataset._fill_cache([sio.load_slp(minimal_instance)])

     gt_sample_keys = [
         "centroid",
@@ -711,7 +717,7 @@ def test_centered_multiclass_dataset(minimal_instance, tmp_path):
         cache_img="disk",
         cache_img_path=f"{tmp_path}/cache_imgs",
     )
-    dataset._fill_cache()
+    dataset._fill_cache([tracked_labels, tracked_labels, tracked_labels])

     gt_sample_keys = [
         "centroid",
@@ -749,7 +755,7 @@ def test_centered_multiclass_dataset(minimal_instance, tmp_path):
         cache_img="memory",
         apply_aug=base_topdown_data_config.use_augmentations_train,
     )
-    dataset._fill_cache()
+    dataset._fill_cache([tracked_labels])

     sample = next(iter(dataset))
     assert len(sample.keys()) == len(gt_sample_keys)
@@ -923,7 +929,7 @@ def test_centroid_dataset(minimal_instance, tmp_path):
         cache_img="disk",
         cache_img_path=f"{tmp_path}/cache_imgs",
     )
-    dataset._fill_cache()
+    dataset._fill_cache([sio.load_slp(minimal_instance)])

     gt_sample_keys = [
         "image",
@@ -957,7 +963,7 @@ def test_centroid_dataset(minimal_instance, tmp_path):
         apply_aug=base_centroid_data_config.use_augmentations_train,
         labels=[sio.load_slp(minimal_instance)],
     )
-    dataset._fill_cache()
+    dataset._fill_cache([sio.load_slp(minimal_instance)])

     gt_sample_keys = [
         "image",
@@ -1094,7 +1100,7 @@ def test_single_instance_dataset(minimal_instance, tmp_path):
         cache_img="disk",
         cache_img_path=f"{tmp_path}/cache_imgs",
     )
-    dataset._fill_cache()
+    dataset._fill_cache([labels, labels, labels])
     sample = next(iter(dataset))
     assert len(dataset) == 3

@@ -1127,7 +1133,7 @@ def test_single_instance_dataset(minimal_instance, tmp_path):
         cache_img="memory",
         apply_aug=base_singleinstance_data_config.use_augmentations_train,
     )
-    dataset._fill_cache()
+    dataset._fill_cache([labels])

     sample = next(iter(dataset))
     assert len(dataset) == 1

tests/data/test_edge_maps.py

Lines changed: 7 additions & 1 deletion
@@ -196,7 +196,13 @@ def test_generate_pafs(minimal_instance):
     """Test `generate_pafs` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     pafs = generate_pafs(
         ex["instances"],

tests/data/test_instance_centroids.py

Lines changed: 7 additions & 1 deletion
@@ -10,7 +10,13 @@ def test_generate_centroids(minimal_instance):
     """Test `generate_centroids` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     centroids = generate_centroids(ex["instances"], 1).int()
     gt = torch.Tensor([[[152, 158], [278, 203]]]).int()

tests/data/test_instance_cropping.py

Lines changed: 7 additions & 1 deletion
@@ -44,7 +44,13 @@ def test_generate_crops(minimal_instance):
     """Test `generate_crops` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )
     ex["image"] = apply_normalization(ex["image"])

     centroids = generate_centroids(ex["instances"], 0)

tests/data/test_providers.py

Lines changed: 7 additions & 1 deletion
@@ -250,7 +250,13 @@ def test_labelsreader_provider(minimal_instance):
 def test_process_lf(minimal_instance):
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 4)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=4,
+    )

     assert ex["image"].shape == torch.Size([1, 1, 384, 384])
     assert ex["instances"].shape == torch.Size([1, 4, 2, 2])

tests/data/test_resizing.py

Lines changed: 21 additions & 3 deletions
@@ -30,7 +30,13 @@ def test_apply_resizer(minimal_instance):
     """Test `apply_resizer` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     image, instances = apply_resizer(ex["image"], ex["instances"], scale=2.0)
     assert image.shape == torch.Size([1, 1, 768, 768])
@@ -41,7 -47,13 @@ def test_apply_pad_to_stride(minimal_instance):
     """Test `apply_pad_to_stride` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     image = apply_pad_to_stride(ex["image"], max_stride=2)
     assert image.shape == torch.Size([1, 1, 384, 384])
@@ -54,7 +66,13 @@ def test_apply_sizematcher(caplog, minimal_instance):
     """Test `apply_sizematcher` function."""
     labels = sio.load_slp(minimal_instance)
     lf = labels[0]
-    ex = process_lf(lf, 0, 2)
+    ex = process_lf(
+        instances_list=lf.instances,
+        img=lf.image,
+        frame_idx=lf.frame_idx,
+        video_idx=0,
+        max_instances=2,
+    )

     image, _ = apply_sizematcher(ex["image"], 500, 500)
     assert image.shape == torch.Size([1, 1, 500, 500])
