@@ -62,12 +62,12 @@ def update_layer(self, *, base_layer, lora_A, lora_B, scaling, place_on_cpu=False
             weight_norm = weight_norm.to("cpu")
         self.weight = nn.Parameter(weight_norm, requires_grad=True)
 
-    def forward(self, x, *, lora_A, lora_B, scaling, base_layer):
+    def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_layer_result, dropout):
         """
         For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
         output.
         """
-        lora_result = lora_B(lora_A(x))
+        lora_result = lora_B(lora_A(dropout(x)))
 
         # Don't use `lora_weight = lora_B.weight @ lora_A.weight` because this causes errors with FSDP. Instead,
         # calculate the same but using forward.
@@ -86,9 +86,7 @@ def forward(self, x, *, lora_A, lora_B, scaling, base_layer):
         # during backpropagation"
         weight_norm = weight_norm.detach()
         mag_norm_scale = (magnitude / weight_norm).view(1, -1)
-        result_dora = (mag_norm_scale - 1) * (
-            F.linear(x, transpose(weight, self.fan_in_fan_out))
-        ) + mag_norm_scale * lora_result * scaling
+        result_dora = (mag_norm_scale - 1) * base_layer_result + mag_norm_scale * lora_result * scaling
 
         # Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again.
         # This is only correct if dropout=0, otherwise results will differ:
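To see why the simplification in this hunk is safe, here is a small standalone check (not the PEFT code itself), assuming a bias-free base layer, fan_in_fan_out=False, and an identity dropout: reusing the already computed base_layer_result gives the same result_dora as re-running F.linear, and adding result_dora on top of the base output reproduces the full DoRA output magnitude * x @ (W + scaling * B A)^T / ||W + scaling * B A||.

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    x = torch.randn(4, 16)
    weight = torch.randn(8, 16)                      # base weight, shape (out_features, in_features)
    lora_A = torch.nn.Linear(16, 2, bias=False)
    lora_B = torch.nn.Linear(2, 8, bias=False)
    scaling = 0.5
    magnitude = torch.rand(8) + 0.1                  # stands in for the learned DoRA magnitude vector

    dora_weight = weight + scaling * lora_B.weight @ lora_A.weight
    weight_norm = dora_weight.norm(p=2, dim=1)       # column-wise (per output feature) L2 norm
    mag_norm_scale = (magnitude / weight_norm).view(1, -1)

    base_layer_result = F.linear(x, weight)          # computed once by the wrapping LoRA layer
    lora_result = lora_B(lora_A(x))                  # dropout assumed to be nn.Identity here

    # new code path: reuse the base output instead of recomputing F.linear(x, W)
    result_dora = (mag_norm_scale - 1) * base_layer_result + mag_norm_scale * lora_result * scaling

    # adding result_dora to the base output yields the full DoRA result
    full_dora = F.linear(x, magnitude.view(-1, 1) * dora_weight / weight_norm.view(-1, 1))
    assert torch.allclose(base_layer_result + result_dora, full_dora, atol=1e-5)

The design choice shown by the diff is that X @ W is no longer computed a second time inside the DoRA helper, and dropout is applied only to the LoRA branch via the module that is now passed in.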
@@ -142,7 +140,7 @@ def get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
         weight_norm = weight.norm(p=2, dim=dim, keepdim=True).transpose(1, 0)
         return weight_norm
 
-    def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_layer_result=None):
+    def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_layer_result, dropout):
         """
         For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
         output.
@@ -161,21 +159,8 @@ def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_layer_result=None
         weight_norm = weight_norm.detach()
         mag_norm_scale = magnitude / weight_norm
 
-        if isinstance(base_layer_result, torch.Tensor):
-            # the base layer has already computed the convolution, we do not need to compute it again.
-            result_dora = (mag_norm_scale - 1) * base_layer_result + mag_norm_scale * lora_B(lora_A(x)) * scaling
-        else:
-            result_dora = (mag_norm_scale - 1) * (
-                F.conv2d(
-                    x,
-                    weight,
-                    bias=None,
-                    stride=base_layer.stride,
-                    padding=base_layer.padding,
-                    dilation=base_layer.dilation,
-                    groups=base_layer.groups,
-                )
-            ) + mag_norm_scale * lora_B(lora_A(x)) * scaling
+        # the base layer has already computed the convolution, we do not need to compute it again.
+        result_dora = (mag_norm_scale - 1) * base_layer_result + mag_norm_scale * lora_B(lora_A(dropout(x))) * scaling
 
         return result_dora
 
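The Conv2d hunk removes the F.conv2d fallback entirely, so the caller is now expected to always pass the pre-computed convolution output along with the dropout module. A minimal standalone sketch of the equivalence (not the PEFT implementation), assuming a bias-free base convolution and an identity dropout; with an active dropout the two branches deliberately differ, which is why dropout is now routed through the helper:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    torch.manual_seed(0)
    x = torch.randn(2, 3, 8, 8)
    base_layer = nn.Conv2d(3, 4, kernel_size=3, padding=1, bias=False)
    lora_A = nn.Conv2d(3, 2, kernel_size=3, padding=1, bias=False)
    lora_B = nn.Conv2d(2, 4, kernel_size=1, bias=False)
    dropout = nn.Identity()                    # assumed; nn.Dropout would make the two branches differ
    scaling = 2.0
    mag_norm_scale = torch.rand(1, 4, 1, 1)    # stands in for magnitude / weight_norm

    base_layer_result = base_layer(x)          # computed once by the wrapping LoRA layer

    # removed branch: recompute the base convolution inside the DoRA helper
    old = (mag_norm_scale - 1) * F.conv2d(
        x, base_layer.weight, bias=None,
        stride=base_layer.stride, padding=base_layer.padding,
        dilation=base_layer.dilation, groups=base_layer.groups,
    ) + mag_norm_scale * lora_B(lora_A(x)) * scaling

    # kept branch: reuse base_layer_result and route x through the passed-in dropout
    new = (mag_norm_scale - 1) * base_layer_result + mag_norm_scale * lora_B(lora_A(dropout(x))) * scaling

    assert torch.allclose(old, new)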