|
5 | 5 |
|
6 | 6 | import torch |
7 | 7 | from torch import nn |
| 8 | +from torch.nn import functional as F |
| 9 | + |
| 10 | + |
| 11 | +class LinearLayer(nn.Linear): |
| 12 | + def forward(self, x: torch.Tensor, output_subset: torch.Tensor | None = None) -> torch.Tensor: |
| 13 | + if output_subset is None: |
| 14 | + # x: (..., i) -> output: (..., o) |
| 15 | + return super().forward(x) |
| 16 | + elif output_subset.dim() == 1: |
| 18 | + # x: (..., i); output_subset: (o_subset,) holds indices of the requested output features
| 18 | + bias = self.bias[output_subset] if self.bias is not None else None # (o_subset) |
| 19 | + weight = self.weight[output_subset] # (o_subset, i) |
| 20 | + return F.linear(x, weight, bias) # (..., i) -> (..., o_subset) |
| 21 | + else: |
| 22 | + raise NotImplementedError("output_subset with more than one dimension is not supported")
| 23 | + |
8 | 24 |
|
9 | 25 | if TYPE_CHECKING: |
10 | 26 | from typing import Any |
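A quick sanity check of the new LinearLayer path (a minimal sketch, not part of the change itself; the sizes and the idx values are made up for illustration): slicing the weight rows before the matmul should give the same values as computing the full output and then indexing it.

import torch

layer = LinearLayer(in_features=4, out_features=6)
x = torch.randn(2, 4)
idx = torch.tensor([0, 3, 5])          # request only output features 0, 3 and 5

full = layer(x)                         # (2, 6)
subset = layer(x, output_subset=idx)    # (2, 3)

# Selecting weight rows before F.linear should equal indexing the full output.
assert torch.allclose(subset, full[:, idx])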
@@ -39,7 +55,7 @@ class StackedLinearLayer(nn.Module): |
39 | 55 | - Bias shape: (n_channels, out_features) if bias=True, None otherwise |
40 | 56 |
|
41 | 57 | The forward pass applies the transformation to each channel independently: |
42 | | - output[b, c, o] = sum_i(input[b, c, i] * weight[c, i, o]) + bias[c, o] |
| 58 | + output[b, c, o] = sum_i(x[b, c, i] * weight[c, i, o]) + bias[c, o] |
43 | 59 |
|
44 | 60 | This is equivalent to applying n_channels separate linear layers in parallel, |
45 | 61 | which is more efficient than using separate nn.Linear layers. |
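The docstring's claim that the stacked form is equivalent to n_channels separate linear layers can be illustrated with a small sketch (shapes here are arbitrary, not taken from the repository):

import torch

b, c, i, o = 2, 3, 4, 5
x = torch.randn(b, c, i)
weight = torch.randn(c, i, o)
bias = torch.randn(c, o)

# Stacked form: all channels in one einsum, bias broadcasts over the batch.
stacked = torch.einsum("bci,cio->bco", x, weight) + bias

# Loop form: one independent linear transform per channel.
looped = torch.stack([x[:, ch] @ weight[ch] + bias[ch] for ch in range(c)], dim=1)

assert torch.allclose(stacked, looped, atol=1e-6)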
@@ -137,13 +153,15 @@ def _init_bias(self) -> None: |
137 | 153 | bound = 1 / math.sqrt(fan_in) |
138 | 154 | nn.init.uniform_(self.bias, -bound, bound) |
139 | 155 |
|
140 | | - def forward(self, input: torch.Tensor) -> torch.Tensor: |
| 156 | + def forward(self, x: torch.Tensor, output_subset: torch.Tensor | None = None) -> torch.Tensor: |
141 | 157 | r"""Forward pass through the stacked linear layer. |
142 | 158 |
|
143 | 159 | Parameters |
144 | 160 | ---------- |
145 | | - input |
| 161 | + x |
146 | 162 | Input tensor with shape (batch_size, n_channels, in_features). |
| 163 | + output_subset |
| 164 | + Optional 1-D tensor of output feature indices; if given, only these outputs are computed and returned.
147 | 165 |
|
148 | 166 | Returns |
149 | 167 | ------- |
@@ -178,10 +196,18 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: |
178 | 196 | >>> output = layer(x) |
179 | 197 | >>> print(output.shape) # torch.Size([2, 3, 5]) |
180 | 198 | """ |
181 | | - mm = torch.einsum("bci,cio->bco", input, self.weight) |
182 | | - if self.bias is not None: |
183 | | - mm = mm + self.bias # They will broadcast well |
184 | | - return mm |
| 199 | + if output_subset is None or output_subset.dim() == 1: |
| 200 | + # weight: (c, i, o), bias: (c, o) |
| 201 | + # x: (b, c, i), output_subset: (o_subset) -> output: (b, c, o_subset) |
| 202 | + weight = self.weight if output_subset is None else self.weight[:, :, output_subset] # (c, i, o_subset) |
| 203 | + # slower: mm = torch.einsum("bci,cio->bco", x, weight) |
| 204 | + mm = torch.bmm(x.transpose(0, 1), weight).transpose(0, 1) # (b, c, o_subset) |
| 205 | + if self.bias is not None: |
| 206 | + bias = self.bias if output_subset is None else self.bias[:, output_subset] # (c, o_subset) |
| 207 | + mm = mm + bias # (b, c, o_subset) + (c, o_subset): bias broadcasts over the batch dimension
| 208 | + return mm |
| 209 | + else: |
| 210 | + raise NotImplementedError("output_subset with more than one dimension is not supported")
185 | 211 |
|
186 | 212 | def extra_repr(self) -> str: |
187 | 213 | """String representation for printing the layer. |
|
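The bmm-based path introduced in this forward should match the einsum it replaces, and slicing the weight along the output dimension should match indexing the full result; a minimal sketch under those assumptions (shapes are illustrative):

import torch

b, c, i, o = 2, 3, 4, 5
x = torch.randn(b, c, i)
weight = torch.randn(c, i, o)
idx = torch.tensor([0, 2])

# Old einsum path vs. new bmm path over all output features.
via_einsum = torch.einsum("bci,cio->bco", x, weight)
via_bmm = torch.bmm(x.transpose(0, 1), weight).transpose(0, 1)
assert torch.allclose(via_einsum, via_bmm, atol=1e-6)

# Slicing the output dimension of the weight first matches indexing afterwards.
sliced = torch.bmm(x.transpose(0, 1), weight[:, :, idx]).transpose(0, 1)
assert torch.allclose(sliced, via_bmm[:, :, idx], atol=1e-6)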