Optimize DoRA in eval and no dropout #2122

Merged: 17 commits merged on Oct 16, 2024
34 changes: 18 additions & 16 deletions src/peft/tuners/lora/dora.py
@@ -62,13 +62,11 @@ def update_layer(self, *, base_layer, lora_A, lora_B, scaling, place_on_cpu=False):
weight_norm = weight_norm.to("cpu")
self.weight = nn.Parameter(weight_norm, requires_grad=True)

def forward(self, x, *, lora_A, lora_B, scaling, base_layer):
def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_result=None):
"""
For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
output.
"""
lora_result = lora_B(lora_A(x))

# Don't use `lora_weight = lora_B.weight @ lora_A.weight` because this causes errors with FSDP. Instead,
# calculate the same but using forward.
x_eye = torch.eye(lora_A.weight.shape[1], device=lora_A.weight.device, dtype=x.dtype)
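
For illustration, here is a minimal self-contained sketch (made-up dimensions, bias-free LoRA layers as PEFT creates them) of why the identity-matrix trick above reproduces lora_B.weight @ lora_A.weight while only going through forward calls:

    import torch
    import torch.nn as nn

    # Hypothetical sizes: in_features=8, rank=4, out_features=16.
    lora_A = nn.Linear(8, 4, bias=False)
    lora_B = nn.Linear(4, 16, bias=False)

    # lora_A(I) = I @ A.T = A.T, and lora_B(A.T) = A.T @ B.T = (B @ A).T,
    # so transposing the result gives B @ A without touching .weight directly.
    x_eye = torch.eye(lora_A.weight.shape[1])
    lora_weight = lora_B(lora_A(x_eye)).T

    assert torch.allclose(lora_weight, lora_B.weight @ lora_A.weight, atol=1e-6)
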
@@ -86,19 +84,23 @@ def forward(self, x, *, lora_A, lora_B, scaling, base_layer):
# during backpropagation"
weight_norm = weight_norm.detach()
mag_norm_scale = (magnitude / weight_norm).view(1, -1)
result_dora = (mag_norm_scale - 1) * (
F.linear(x, transpose(weight, self.fan_in_fan_out))
) + mag_norm_scale * lora_result * scaling

# Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again.
# This is only correct if dropout=0, otherwise results will differ:
# https://github.com/huggingface/peft/pull/1474#issuecomment-1964682771
# bias = self.get_base_layer().bias
# if bias is not None:
# result = result - bias
# result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling
# if bias is not None:
# result = result + bias

lora_result = lora_B(lora_A(x))

if base_result is not None:
# `base_result` is provided only if dropout is set to 0 or if the model is in evaluation mode.
# This means we already have a deterministic output from the base layer that can be reused.
bias = base_layer.bias
if bias is not None:
base_result = base_result - bias
result_dora = mag_norm_scale * base_result + mag_norm_scale * lora_result * scaling
BenjaminBossan (Member):

Hmm, wait, should this not be the exact same calculation as in line 103? I.e. we should leave the condition after calculating the base_result and then do the same calculation of dora_result for both cases.

ariG23498 (Contributor Author):

I am not sure that I follow. With the base_result in place:

  1. We first subtract the bias
  2. Compute the dora_result, where we scale the base_result with mag_norm_scale

But without the base_result:

  1. We compute the base_result with the linear forward
  2. Compute the dora_result, where we scale the base_result with (mag_norm_scale - 1)

Aren't they going to be different for each case?

BenjaminBossan (Member):

Okay, so I'm a bit confused, let's try to resolve this.

In the old code, we basically have:

    dora_result = (mag_norm_scale - 1) * base_result + mag_norm_scale * lora_result * lora_scale

(variable names slightly changed for clarity)

My thinking is that the base_result is either calculated right there (old code) or we use the base_result that is being passed as an argument, but the basic equation stays the same.

Of course, as you correctly noted, the bias needs to be subtracted first and then added back in the latter case.

In the currently proposed code, in one case we calculate mag_norm_scale * base_result and in the other (mag_norm_scale - 1) * base_result. This looks inconsistent to me.
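
As a quick numeric check of the algebra in question (random stand-in tensors, no real layers, assuming the bias has already been handled): adding (mag_norm_scale - 1) * base + mag_norm_scale * lora * scaling on top of the base output gives the same total as using mag_norm_scale * base + mag_norm_scale * lora * scaling as the full output, so the two forms differ only in whether the caller adds the base output back:

    import torch

    torch.manual_seed(0)
    base = torch.randn(2, 16)           # stand-in for x @ W.T (no bias)
    lora = torch.randn(2, 16)           # stand-in for lora_B(lora_A(x))
    scaling = 0.5
    mag_norm_scale = torch.rand(1, 16)  # stand-in for magnitude / weight_norm

    # "extra output" form: meant to be added on top of the base output
    extra = (mag_norm_scale - 1) * base + mag_norm_scale * lora * scaling
    # "full output" form: meant to replace the base output
    full = mag_norm_scale * base + mag_norm_scale * lora * scaling

    assert torch.allclose(base + extra, full, atol=1e-6)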

ariG23498 (Contributor Author):

Hi @BenjaminBossan

I have made the changes as suggested.

BenjaminBossan (Member):

I think the change is still not correct. Here is my suggestion:

        bias = None
        if base_result is not None:
            bias = base_layer.bias
            if bias is not None:
                base_result = base_result - bias
        else:
            base_result = F.linear(x, transpose(weight, self.fan_in_fan_out))

        result_dora = (mag_norm_scale - 1) * base_result + mag_norm_scale * lora_result * scaling

        if bias is not None:
            result_dora = result_dora + bias

This way, if base_result = None, the computation is exactly the same as it was previously.

I believe the confusion may stem from my comment:

        # result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling

This comment should have been:

        # result = (mag_norm_scale - 1) * result + mag_norm_scale * lora_B(lora_A(x)) * scaling

Does that make sense?

ariG23498 (Contributor Author):

That makes sense!

About the unit test -- do you want me to add a new test file? Or add a test somewhere?

if bias is not None:
result_dora = result_dora + bias
else:
# If `base_result` is not provided (likely due to dropout being used or training mode),
# calculate it directly using the base layer weights.
base_result = F.linear(x, transpose(weight, self.fan_in_fan_out))
result_dora = (mag_norm_scale - 1) * base_result + mag_norm_scale * lora_result * scaling

return result_dora
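
As a self-contained sanity check of the two branches above (plain nn.Linear base layer, no dropout, no quantization, made-up mag_norm_scale), mirroring how the call sites in layer.py below use them -- the reuse branch replaces the layer output, the recompute branch is added on top of it:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    torch.manual_seed(0)
    base_layer = nn.Linear(8, 16)        # base layer with a bias
    lora_A = nn.Linear(8, 4, bias=False)
    lora_B = nn.Linear(4, 16, bias=False)
    scaling = 0.5
    mag_norm_scale = torch.rand(1, 16)   # stand-in for magnitude / weight_norm

    x = torch.randn(2, 8)
    base_result = base_layer(x)          # what the LoRA layer has already computed
    lora_result = lora_B(lora_A(x))

    # Reuse branch: subtract the bias, scale, re-add the bias; used as the full output.
    bias = base_layer.bias
    no_bias = base_result - bias
    out_reuse = mag_norm_scale * no_bias + mag_norm_scale * lora_result * scaling + bias

    # Recompute branch: x @ W.T without bias; the caller adds this on top of base_result.
    recomputed = F.linear(x, base_layer.weight)
    extra = (mag_norm_scale - 1) * recomputed + mag_norm_scale * lora_result * scaling
    out_recompute = base_result + extra

    assert torch.allclose(out_reuse, out_recompute, atol=1e-5)

Once dropout actually drops values, the x fed to the LoRA path no longer matches the x the base layer saw, which is why this shortcut is restricted to the no-dropout / eval case.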

27 changes: 19 additions & 8 deletions src/peft/tuners/lora/layer.py
@@ -585,14 +585,25 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
if not self.use_dora[active_adapter]:
result = result + lora_B(lora_A(dropout(x))) * scaling
else:
x = dropout(x)
result = result + self.lora_magnitude_vector[active_adapter](
x,
lora_A=lora_A,
lora_B=lora_B,
scaling=scaling,
base_layer=self.get_base_layer(),
)
if isinstance(dropout, nn.Identity) or not self.training:
result = self.lora_magnitude_vector[active_adapter](
x,
lora_A=lora_A,
lora_B=lora_B,
scaling=scaling,
base_layer=self.get_base_layer(),
base_result=result,
)
else:
x = dropout(x)
result = result + self.lora_magnitude_vector[active_adapter](
x,
lora_A=lora_A,
lora_B=lora_B,
scaling=scaling,
base_layer=self.get_base_layer(),
base_result=None,
)

result = result.to(torch_result_dtype)
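
For a usage-level illustration of when the new base_result path is taken (hypothetical toy model; assumes a PEFT version with DoRA support): lora_dropout=0.0 gives an nn.Identity dropout and model.eval() clears self.training, and either condition routes the forward pass through the branch that reuses result:

    import torch
    import torch.nn as nn
    from peft import LoraConfig, get_peft_model

    class TinyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(8, 8)

        def forward(self, x):
            return self.linear(x)

    # lora_dropout=0.0 keeps dropout as nn.Identity, so the reuse path is taken
    # even during training; model.eval() would trigger it as well via self.training.
    config = LoraConfig(target_modules=["linear"], use_dora=True, lora_dropout=0.0)
    model = get_peft_model(TinyModel(), config)

    model.eval()
    with torch.no_grad():
        out = model(torch.randn(2, 8))
    print(out.shape)  # torch.Size([2, 8])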
