Replace kornia crop_and_resize with fast tensor indexing (#426)

talmo · claude · web-flow · commit fe2fa111ebdc · 2026-01-19T21:30:45.000-08:00
## Summary

- Replaces kornia's `crop_and_resize` in `peak_finding.py` with a faster implementation using tensor unfold operations
- Achieves **17-51x speedup** (CUDA/MPS) by avoiding perspective transform computations
- Removes the MPS special case that disabled integral refinement on Mac (no longer needed)
- Adds tests for edge cases (boundary peaks, empty inputs, multiple samples)

## Performance

| Platform | kornia | simple indexing | Speedup |
|----------|--------|-----------------|---------|
| MPS (M-series Mac) | 21.45 ms | 0.42 ms | **51x** |
| CUDA (RTX A6000) | 2.64 ms | 0.15 ms | **17x** |

The new approach:

- Uses `F.pad` + `unfold` to create patch views (no memory copy)
- Selects patches via advanced indexing
- Removes dependency on `torch.linalg.solve` (was blocking MPS support in older PyTorch)

## Test plan

- [x] Existing `test_peak_finding.py` tests pass
- [x] Added `test_crop_bboxes_edge_cases` for boundary peaks
- [x] Added `test_crop_bboxes_empty` for empty inputs
- [x] Added `test_crop_bboxes_multiple_samples` for multi-sample batches
- [ ] Run full inference on real data to verify end-to-end correctness

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/sleap_nn/inference/peak_finding.py b/sleap_nn/inference/peak_finding.py
@@ -3,17 +3,20 @@
 from typing import Optional, Tuple
 
 import kornia as K
-import numpy as np
 import torch
-from kornia.geometry.transform import crop_and_resize
+import torch.nn.functional as F
 
 from sleap_nn.data.instance_cropping import make_centered_bboxes
 
 
 def crop_bboxes(
     images: torch.Tensor, bboxes: torch.Tensor, sample_inds: torch.Tensor
 ) -> torch.Tensor:
-    """Crop bounding boxes from a batch of images.
+    """Crop bounding boxes from a batch of images using fast tensor indexing.
+
+    This uses tensor unfold operations to extract patches, which is significantly
+    faster than kornia's crop_and_resize (17-51x speedup) as it avoids perspective
+    transform computations.
 
     Args:
         images: Tensor of shape (samples, channels, height, width) of a batch of images.
@@ -27,7 +30,7 @@ def crop_bboxes(
             box should be cropped from.
 
     Returns:
-        A tensor of shape (n_bboxes, crop_height, crop_width, channels) of the same
+        A tensor of shape (n_bboxes, channels, crop_height, crop_width) of the same
         dtype as the input image. The crop size is inferred from the bounding box
         coordinates.
 
@@ -42,26 +45,51 @@ def crop_bboxes(
 
     See also: `make_centered_bboxes`
     """
+    n_crops = bboxes.shape[0]
+    if n_crops == 0:
+        # Return an empty (0, channels, 0, 0) tensor; the crop size cannot be inferred without bboxes.
+        return torch.empty(
+            0, images.shape[1], 0, 0, device=images.device, dtype=images.dtype
+        )
+
     # Compute bounding box size to use for crops.
-    height = abs(bboxes[0, 3, 1] - bboxes[0, 0, 1])
-    width = abs(bboxes[0, 1, 0] - bboxes[0, 0, 0])
-    box_size = tuple(torch.round(torch.Tensor((height + 1, width + 1))).to(torch.int32))
+    height = int(abs(bboxes[0, 3, 1] - bboxes[0, 0, 1]).item()) + 1
+    width = int(abs(bboxes[0, 1, 0] - bboxes[0, 0, 0]).item()) + 1
 
     # Store original dtype for conversion back after cropping.
     original_dtype = images.dtype
+    device = images.device
+    n_samples, channels, img_h, img_w = images.shape
+    half_h, half_w = height // 2, width // 2
 
-    # Kornia's crop_and_resize requires float32 input.
-    images_to_crop = images[sample_inds]
-    if not torch.is_floating_point(images_to_crop):
-        images_to_crop = images_to_crop.float()
-
-    # Crop.
-    crops = crop_and_resize(
-        images_to_crop,  # (n_boxes, channels, height, width)
-        boxes=bboxes,
-        size=box_size,
+    # Pad images for edge handling.
+    images_padded = F.pad(
+        images.float(), (half_w, half_w, half_h, half_h), mode="constant", value=0
     )
 
+    # Extract all possible patches using unfold (creates a view, no copy).
+    # Shape after unfold: (n_samples, channels, img_h, img_w, height, width) for odd crop sizes; even sizes yield img_h + 1 and img_w + 1 positions.
+    patches = images_padded.unfold(2, height, 1).unfold(3, width, 1)
+
+    # Get crop centers from bboxes.
+    # The bbox top-left is at index 0, with (x, y) coordinates.
+    # We need the center of the crop (peak location), which is top-left + half_size.
+    crop_x = (bboxes[:, 0, 0] + half_w).to(torch.long)
+    crop_y = (bboxes[:, 0, 1] + half_h).to(torch.long)
+
+    # Clamp indices to valid bounds to handle edge cases where centroids
+    # might be at or beyond image boundaries.
+    crop_x = torch.clamp(crop_x, 0, patches.shape[3] - 1)
+    crop_y = torch.clamp(crop_y, 0, patches.shape[2] - 1)
+
+    # Select crops using advanced indexing.
+    # Convert sample_inds to tensor if it's a list.
+    if not isinstance(sample_inds, torch.Tensor):
+        sample_inds = torch.tensor(sample_inds, device=device)
+    sample_inds_long = sample_inds.to(torch.long)
+    crops = patches[sample_inds_long, :, crop_y, crop_x]
+    # Shape: (n_crops, channels, height, width)
+
     # Cast back to original dtype and return.
     crops = crops.to(original_dtype)
     return crops
diff --git a/sleap_nn/predict.py b/sleap_nn/predict.py
@@ -448,13 +448,6 @@ def run_inference(
                 else "mps" if torch.backends.mps.is_available() else "cpu"
             )
 
-        if integral_refinement is not None and device == "mps":  # TODO
-            # kornia/geometry/transform/imgwarp.py:382: in get_perspective_transform. NotImplementedError: The operator 'aten::_linalg_solve_ex.result' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.
-            logger.info(
-                "Integral refinement is not supported with MPS accelerator. Setting integral refinement to None."
-            )
-            integral_refinement = None
-
         logger.info(f"Using device: {device}")
 
         # initializes the inference model
diff --git a/tests/inference/test_peak_finding.py b/tests/inference/test_peak_finding.py
@@ -8,6 +8,7 @@
     find_local_peaks,
     find_local_peaks_rough,
 )
+from sleap_nn.data.instance_cropping import make_centered_bboxes
 
 
 def test_crop_bboxes(minimal_bboxes, minimal_cms):
@@ -28,6 +29,84 @@ def test_crop_bboxes(minimal_bboxes, minimal_cms):
     assert cm_crops.dtype == torch.float32
 
 
+def test_crop_bboxes_edge_cases():
+    """Test crop_bboxes with edge cases like peaks near image boundaries."""
+    # Create a test image with peaks at various positions including edges
+    img = torch.zeros(1, 1, 20, 20)
+
+    # Peak at center
+    img[0, 0, 10, 10] = 1.0
+
+    # Peak at corner (0, 0)
+    img[0, 0, 0, 0] = 0.8
+
+    # Peak at edge
+    img[0, 0, 0, 10] = 0.9
+
+    # Create bboxes for these peaks
+    points = torch.tensor(
+        [
+            [10.0, 10.0],  # center
+            [0.0, 0.0],  # corner
+            [10.0, 0.0],  # edge
+        ]
+    )
+    bboxes = make_centered_bboxes(points, box_height=5, box_width=5)
+    sample_inds = torch.tensor([0, 0, 0])
+
+    crops = crop_bboxes(img, bboxes, sample_inds)
+
+    assert crops.shape == (3, 1, 5, 5)
+
+    # Center crop should have the peak at center
+    assert crops[0, 0, 2, 2] == 1.0
+
+    # Corner crop should have the peak at center (with zero padding)
+    assert crops[1, 0, 2, 2] == 0.8
+
+    # Edge crop should have the peak at center
+    assert crops[2, 0, 2, 2] == 0.9
+
+
+def test_crop_bboxes_empty():
+    """Test crop_bboxes with empty bboxes."""
+    img = torch.zeros(1, 1, 20, 20)
+    bboxes = torch.empty(0, 4, 2)
+    sample_inds = torch.empty(0, dtype=torch.long)
+
+    crops = crop_bboxes(img, bboxes, sample_inds)
+
+    # Should return empty tensor
+    assert crops.shape[0] == 0
+    assert crops.shape[1] == 1  # Preserves channel dimension
+
+
+def test_crop_bboxes_multiple_samples():
+    """Test crop_bboxes with multiple samples."""
+    # Create 3 samples with different peak locations
+    imgs = torch.zeros(3, 1, 20, 20)
+    imgs[0, 0, 5, 5] = 1.0
+    imgs[1, 0, 10, 10] = 2.0
+    imgs[2, 0, 15, 15] = 3.0
+
+    points = torch.tensor(
+        [
+            [5.0, 5.0],
+            [10.0, 10.0],
+            [15.0, 15.0],
+        ]
+    )
+    bboxes = make_centered_bboxes(points, box_height=5, box_width=5)
+    sample_inds = torch.tensor([0, 1, 2])
+
+    crops = crop_bboxes(imgs, bboxes, sample_inds)
+
+    assert crops.shape == (3, 1, 5, 5)
+    assert crops[0, 0, 2, 2] == 1.0
+    assert crops[1, 0, 2, 2] == 2.0
+    assert crops[2, 0, 2, 2] == 3.0
+
+
 def test_integral_regression(minimal_bboxes, minimal_cms):
     cms = torch.load(minimal_cms).unsqueeze(0)
     bboxes = torch.load(minimal_bboxes)
diff --git a/tests/training/test_lightning_modules.py b/tests/training/test_lightning_modules.py
@@ -1,8 +1,10 @@
 """Test TrainingModule classes."""
 
 import numpy as np
+import os
 from pathlib import Path
 from omegaconf import OmegaConf
+import wandb
 from sleap_nn.data.custom_datasets import (
     get_train_val_dataloaders,
     get_train_val_datasets,
@@ -39,6 +41,31 @@ def caplog(caplog: LogCaptureFixture):
     logger.remove(handler_id)
 
 
+@pytest.fixture(autouse=True)
+def cleanup_wandb():
+    """Ensure wandb runs in offline mode and is cleaned up after each test.
+
+    This fixture:
+    1. Sets WANDB_MODE=offline to prevent network hangs on CI
+    2. Cleans up any active wandb run after the test to prevent state leakage
+    """
+    # Save original mode and force offline to prevent network hangs on CI
+    original_mode = os.environ.get("WANDB_MODE")
+    os.environ["WANDB_MODE"] = "offline"
+
+    yield
+
+    # Finish any active wandb run to prevent contamination between tests
+    if wandb.run is not None:
+        wandb.finish()
+
+    # Restore original WANDB_MODE
+    if original_mode is not None:
+        os.environ["WANDB_MODE"] = original_mode
+    else:
+        os.environ.pop("WANDB_MODE", None)
+
+
 def test_topdown_centered_instance_model(
     config, tmp_path: str, minimal_instance_centered_instance_ckpt
 ):
diff --git a/tests/training/test_model_trainer.py b/tests/training/test_model_trainer.py
@@ -53,12 +53,28 @@ def caplog(caplog: LogCaptureFixture):
 
 @pytest.fixture(autouse=True)
 def cleanup_wandb():
-    """Ensure wandb run is finished after each test to prevent state leakage."""
+    """Ensure wandb runs in offline mode and is cleaned up after each test.
+
+    This fixture:
+    1. Sets WANDB_MODE=offline to prevent network hangs on CI
+    2. Cleans up any active wandb run after the test to prevent state leakage
+    """
+    # Save original mode and force offline to prevent network hangs on CI
+    original_mode = os.environ.get("WANDB_MODE")
+    os.environ["WANDB_MODE"] = "offline"
+
     yield
+
     # Finish any active wandb run to prevent contamination between tests
     if wandb.run is not None:
         wandb.finish()
 
+    # Restore original WANDB_MODE
+    if original_mode is not None:
+        os.environ["WANDB_MODE"] = original_mode
+    else:
+        os.environ.pop("WANDB_MODE", None)
+
 
 def test_cfg_without_val_labels_path(config, tmp_path, minimal_instance):
     """Test Model Trainer if no val labels path is provided."""