Fix skip connection channel mismatch in ConvNext/SwinT decoders

talmo · claude · talmo · commit 8496a5c1d52d · 2026-01-19T23:05:29.000-08:00
The decoder incorrectly assumed skip connection channels match computed
decoder filters (refine_convs_filters). For ConvNext/SwinT, actual encoder
channels differ from computed filters, causing RuntimeError during training.

Changes:
- Add skip_channels parameter to SimpleUpsamplingBlock
- Add encoder_channels parameter to Decoder
- Pass actual encoder channels from ConvNextWrapper and SwinTWrapper

Fixes training with ConvNext/SwinT backbones when output_stride != 1.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/sleap_nn/architectures/convnext.py b/sleap_nn/architectures/convnext.py
@@ -281,6 +281,10 @@ def __init__(
             # Keep the block output filters the same
             x_in_shape = int(self.arch["channels"][-1] * filters_rate)
 
+        # Encoder channels for skip connections (reversed to match decoder order)
+        # The forward pass uses enc_output[::2][::-1] for skip features
+        encoder_channels = self.arch["channels"][::-1]
+
         self.dec = Decoder(
             x_in_shape=x_in_shape,
             current_stride=self.current_stride,
@@ -293,6 +297,7 @@ def __init__(
             block_contraction=self.block_contraction,
             output_stride=self.output_stride,
             up_interpolate=up_interpolate,
+            encoder_channels=encoder_channels,
         )
 
         if len(self.dec.decoder_stack):
diff --git a/sleap_nn/architectures/encoder_decoder.py b/sleap_nn/architectures/encoder_decoder.py
@@ -25,7 +25,7 @@
 See the `EncoderDecoder` base class for requirements for creating new architectures.
 """
 
-from typing import List, Text, Tuple, Union
+from typing import List, Optional, Text, Tuple, Union
 from collections import OrderedDict
 import torch
 from torch import nn
@@ -391,10 +391,18 @@ def __init__(
         transpose_convs_activation: Text = "relu",
         feat_concat: bool = True,
         prefix: Text = "",
+        skip_channels: Optional[int] = None,
     ) -> None:
         """Initialize the class."""
         super().__init__()
 
+        # Determine skip connection channels
+        # If skip_channels is provided, use it; otherwise fall back to refine_convs_filters
+        # This allows ConvNext/SwinT to specify actual encoder channels
+        self.skip_channels = (
+            skip_channels if skip_channels is not None else refine_convs_filters
+        )
+
         self.x_in_shape = x_in_shape
         self.current_stride = current_stride
         self.upsampling_stride = upsampling_stride
@@ -469,13 +477,13 @@ def __init__(
                     first_conv_in_channels = refine_convs_filters
                 else:
                     if self.up_interpolate:
-                        # With interpolation, input is x_in_shape + feature channels
-                        # The feature channels are the same as x_in_shape since they come from the same level
-                        first_conv_in_channels = x_in_shape + refine_convs_filters
+                        # With interpolation, input is x_in_shape + skip_channels
+                        # skip_channels may differ from refine_convs_filters for ConvNext/SwinT
+                        first_conv_in_channels = x_in_shape + self.skip_channels
                     else:
-                        # With transpose conv, input is transpose_conv_output + feature channels
+                        # With transpose conv, input is transpose_conv_output + skip_channels
                         first_conv_in_channels = (
-                            refine_convs_filters + transpose_convs_filters
+                            self.skip_channels + transpose_convs_filters
                         )
             else:
                 if not self.feat_concat:
@@ -582,6 +590,7 @@ def __init__(
         block_contraction: bool = False,
         up_interpolate: bool = True,
         prefix: str = "dec",
+        encoder_channels: Optional[List[int]] = None,
     ) -> None:
         """Initialize the class."""
         super().__init__()
@@ -598,6 +607,7 @@ def __init__(
         self.block_contraction = block_contraction
         self.prefix = prefix
         self.stride_to_filters = {}
+        self.encoder_channels = encoder_channels
 
         self.current_strides = []
         self.residuals = 0
@@ -624,6 +634,13 @@ def __init__(
 
             next_stride = current_stride // 2
 
+            # Determine skip channels for this decoder block
+            # If encoder_channels provided, use actual encoder channels
+            # Otherwise fall back to computed filters (for UNet compatibility)
+            skip_channels = None
+            if encoder_channels is not None and block < len(encoder_channels):
+                skip_channels = encoder_channels[block]
+
             if self.stem_blocks > 0 and block >= down_blocks + self.stem_blocks:
                 # This accounts for the case where we dont have any more down block features to concatenate with.
                 # In this case, add a simple upsampling block with a conv layer and with no concatenation
@@ -642,6 +659,7 @@ def __init__(
                         transpose_convs_batch_norm=False,
                         feat_concat=False,
                         prefix=f"{self.prefix}{block}_s{current_stride}_to_s{next_stride}",
+                        skip_channels=skip_channels,
                     )
                 )
             else:
@@ -659,6 +677,7 @@ def __init__(
                         transpose_convs_filters=block_filters_out,
                         transpose_convs_batch_norm=False,
                         prefix=f"{self.prefix}{block}_s{current_stride}_to_s{next_stride}",
+                        skip_channels=skip_channels,
                     )
                 )
 
diff --git a/sleap_nn/architectures/swint.py b/sleap_nn/architectures/swint.py
@@ -309,6 +309,13 @@ def __init__(
             self.stem_patch_stride * (2**3) * 2
         )  # stem_stride * down_blocks_stride * final_max_pool_stride
 
+        # Encoder channels for skip connections (reversed to match decoder order)
+        # SwinT channels: embed * 2^i for each stage i, then reversed
+        num_stages = len(self.arch["depths"])
+        encoder_channels = [
+            self.arch["embed"] * (2 ** (num_stages - 1 - i)) for i in range(num_stages)
+        ]
+
         self.dec = Decoder(
             x_in_shape=block_filters,
             current_stride=self.current_stride,
@@ -321,6 +328,7 @@ def __init__(
             block_contraction=self.block_contraction,
             output_stride=output_stride,
             up_interpolate=up_interpolate,
+            encoder_channels=encoder_channels,
         )
 
         if len(self.dec.decoder_stack):