huggingface · nsbg · May 17, 2025 · Aug 2, 2025 · Sep 2, 2025 · Sep 2, 2025
diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py
@@ -664,6 +664,16 @@ class LoraConfig(PeftConfig):
         default=None, metadata={"help": "The necessary config to apply arrow routing on the model."}
     )
 
+    use_kasa: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Enable <a href='https://arxiv.org/abs/2412.06071'>'Knowledge-Aware Singular-Value Adaptation of Large Language Models' (KaSA)</a>. This technique leverages "
+                "singular value decomposition (SVD) with knowledge-aware singular values to dynamically "
+                "activate parametric knowledge according to its relevance to downstream tasks."
+            )
+        }
+    )
     def to_dict(self):
         """
         Returns the configuration for your adapter model as a dictionary. Removes runtime configurations.

diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py
@@ -113,6 +113,7 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, *
         self._disable_adapters = False
         self.merged_adapters = []
         self.use_dora: dict[str, bool] = {}  # not actively used anymore after #2443, keep it for BC
+        self.use_kasa: dict[str, bool] = {}
         self.use_rslora: dict[str, bool] = {}
         self.lora_bias: dict[str, bool] = {}
         self.lora_magnitude_vector = torch.nn.ModuleDict()  # for DoRA
@@ -181,7 +182,7 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, *
         self.in_features = in_features
         self.out_features = out_features
 
-    def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
+    def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
         """Return a matching LoRA variant for this layer type.
 
         Given the init arguments of this layer, return the correct LoRA variant, if any. E.g., if `use_dora=True`, this
@@ -204,6 +205,7 @@ def update_layer(
         init_lora_weights,
         use_rslora,
         use_dora: bool = False,
+        use_kasa: bool = False,
         use_alora: bool = False,
         use_qalora: bool = False,
         lora_bias: bool = False,
@@ -229,11 +231,13 @@ def update_layer(
 
         lora_variant = self.resolve_lora_variant(
             use_dora=use_dora,
+            use_kasa=use_kasa,
             use_alora=use_alora,
             use_qalora=use_qalora,
             qalora_group_size=qalora_group_size,
             arrow_config=arrow_config,
         )
+
         if lora_variant is not None:
             self.lora_variant[adapter_name] = lora_variant
 
@@ -664,6 +668,7 @@ def __init__(
         init_lora_weights: Union[bool, str] = True,
         use_rslora: bool = False,
         use_dora: bool = False,
+        use_kasa: bool = False,
         use_alora: bool = False,
         arrow_config: ArrowConfig = None,
         lora_bias: bool = False,
@@ -682,14 +687,15 @@ def __init__(
             init_lora_weights=init_lora_weights,
             use_rslora=use_rslora,
             use_dora=use_dora,
+            use_kasa=use_kasa,
             use_alora=use_alora,
             lora_bias=lora_bias,
             arrow_config=arrow_config,
         )
         self.is_target_conv_1d_layer = is_target_conv_1d_layer
 
     def resolve_lora_variant(
-        self, *, arrow_config: ArrowConfig, use_dora: bool, use_alora: bool, **kwargs
+        self, *, arrow_config: ArrowConfig, use_dora: bool, use_alora: bool, use_kasa: bool, **kwargs
     ) -> Optional[LoraVariant]:
         if arrow_config is not None:
             from .variants import ArrowLinearVariant
@@ -819,8 +825,6 @@ def get_delta_weight(self, adapter) -> torch.Tensor:
             weight_A = weight_A.float()
             weight_B = weight_B.float()
 
-        output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter]
-
         if cast_to_fp32:
             output_tensor = output_tensor.to(dtype=dtype)
 
@@ -916,7 +920,7 @@ def __init__(
             arrow_config=arrow_config,
         )
 
-    def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
+    def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
         if not use_dora:
             return None
 
@@ -933,6 +937,7 @@ def update_layer(
         init_lora_weights,
         use_rslora,
         use_dora,
+        use_kasa,
         lora_bias,
         arrow_config: ArrowConfig = None,
         inference_mode: bool = False,
@@ -945,7 +950,8 @@ def update_layer(
         if r <= 0:
             raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
 
-        lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config)
+        lora_variant = self.resolve_lora_variant(use_dora=use_dora, use_kasa=use_kasa, arrow_config=arrow_config)
+
         if lora_variant is not None:
             self.lora_variant[adapter_name] = lora_variant
 
@@ -1180,6 +1186,7 @@ def __init__(
         init_lora_weights: Union[bool, str] = True,
         use_rslora: bool = False,
         use_dora: bool = False,
+        use_kasa: bool = False,
         arrow_config: ArrowConfig = None,
         lora_bias: bool = False,
         **kwargs,
@@ -1222,6 +1229,7 @@ def update_layer(
         init_lora_weights,
         use_rslora,
         use_dora,
+        use_kasa,
         lora_bias,
         arrow_config: ArrowConfig = None,
         inference_mode: bool = False,
@@ -1241,7 +1249,7 @@ def update_layer(
                 PeftWarning,
             )
 
-        lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config)
+        lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config, use_kasa=use_kasa)
         if lora_variant is not None:
             self.lora_variant[adapter_name] = lora_variant
 
@@ -1485,7 +1493,7 @@ def __init__(self, *args, **kwargs):
             raise ValueError(f"Conv2d layer kernel must have 4 dimensions, not {self._kernel_dim}")
         self.conv_fn = F.conv2d
 
-    def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
+    def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
         if not use_dora:
             return None
 
@@ -1502,7 +1510,7 @@ def __init__(self, *args, **kwargs):
             raise ValueError(f"Conv1d layer kernel must have 3 dimensions, not {self._kernel_dim}")
         self.conv_fn = F.conv1d
 
-    def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
+    def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
         if not use_dora:
             return None
 
@@ -1519,7 +1527,7 @@ def __init__(self, *args, **kwargs):
             raise ValueError(f"Conv3d layer kernel must have 5 dimensions, not {self._kernel_dim}")
         self.conv_fn = F.conv3d
 
-    def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
+    def resolve_lora_variant(self, *, use_dora: bool, use_kasa: bool, **kwargs) -> Optional[LoraVariant]:
         if not use_dora:
             return None
 
@@ -2019,7 +2027,7 @@ def update_layer(
             raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
 
         lora_variant = self.resolve_lora_variant(
-            use_dora=use_dora, use_qalora=use_qalora, qalora_group_size=qalora_group_size
+            use_dora=use_dora, use_qalora=use_qalora, qalora_group_size=qalora_group_size, use_kasa=use_kasa
         )
         if lora_variant is not None:
             raise ValueError(f"lora.{self.__class__.__name__} does not work with LoRA variants like DoRA.")

diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py
@@ -439,6 +439,82 @@ def init(module: Conv3d, adapter_name: str, **kwargs: Any) -> None:
         _DoraConvNdVariant.init_convd_variant(module, adapter_name, dora_layer=dora_layer)
 
 
+class KasaLinearVariant(LoraVariant):
+    """LoRA variant for `Linear` that scales the rank dimension by a learnable `lora_diag` vector (KaSA)."""
+
+    @staticmethod
+    def init(module: Linear, adapter_name: str, **kwargs: Any) -> None:
+        """Create `lora_diag` for `adapter_name` and replace the base weight by its SVD truncation."""
+        if not module.lora_diag:
+            # lora_diag is still empty, so add its name to adapter_layer_names once
+            module.adapter_layer_names = module.adapter_layer_names[:] + ("lora_diag",)
+        rank = module.r[adapter_name]
+        module.lora_diag[adapter_name] = nn.Parameter(torch.randn(rank), requires_grad=True)
+
+        # see https://github.com/juyongjiang/KaSA/blob/f85e88c22d0fa4cb8ab2923d7c2bf1bbec152da3/peft/src/peft/tuners/lora/layer.py#L132
+        base_layer = module.get_base_layer()
+        dtype = base_layer.weight.dtype
+        # the SVD runs in float32; the truncated weight is cast back to the original dtype below
+        U, S, Vh = torch.linalg.svd(base_layer.weight.data.to(torch.float32), full_matrices=False)
+        # S has min(out_features, in_features) entries, so derive the kept rank from its length to drop
+        # exactly `rank` trailing components; clamp at 0 so an oversized rank is not a negative slice
+        svd_rank = max(S.shape[0] - rank, 0)
+        U_principal, S_principal, Vh_principal = U[:, :svd_rank], S[:svd_rank], Vh[:svd_rank, :]
+        # scaling the columns of U by S equals U @ diag(S) without building the dense diagonal matrix
+        base_layer.weight.data = ((U_principal * S_principal) @ Vh_principal).to(dtype)
+
+    @staticmethod
+    def _get_delta_weight(weight_A, weight_B, lora_diag, scaling, fan_in_fan_out):
+        """Return `scaling * (weight_B @ diag(lora_diag) @ weight_A)`, transposed if `fan_in_fan_out`."""
+        # broadcasting lora_diag over the columns of weight_B equals weight_B @ diag(lora_diag)
+        delta = (weight_B * lora_diag) @ weight_A
+        if fan_in_fan_out:
+            delta = delta.transpose(0, 1)
+        return delta * scaling
+
+    @staticmethod
+    def _delta_for(module: Linear, active_adapter: str) -> torch.Tensor:
+        """Compute the delta weight of `active_adapter` from the tensors stored on `module`."""
+        return KasaLinearVariant._get_delta_weight(
+            module.lora_A[active_adapter].weight,
+            module.lora_B[active_adapter].weight,
+            module.lora_diag[active_adapter],
+            module.scaling[active_adapter],
+            module.fan_in_fan_out,
+        )
+
+    @staticmethod
+    def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor:
+        """Return a new tensor equal to `orig_weight` plus the adapter's delta weight."""
+        return orig_weight + KasaLinearVariant._delta_for(module, active_adapter)
+
+    @staticmethod
+    def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None:
+        """Add the adapter's delta weight to `orig_weight` in place."""
+        orig_weight.data += KasaLinearVariant._delta_for(module, active_adapter)
+
+    @staticmethod
+    def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor:
+        """Return `orig_weight` minus the adapter's delta weight."""
+        return orig_weight - KasaLinearVariant._delta_for(module, active_adapter)
+
+    @staticmethod
+    def forward(module: Linear, active_adapter: str, x: torch.Tensor, result: torch.Tensor) -> torch.Tensor:
+        """Add the scaled KaSA adapter output for `x` to `result` and return the sum."""
+        lora_A = module.lora_A[active_adapter]
+        lora_B = module.lora_B[active_adapter]
+        dropout = module.lora_dropout[active_adapter]
+        scaling = module.scaling[active_adapter]
+        lora_diag = module.lora_diag[active_adapter]
+
+        # KaSA calculation
+        # see https://github.com/juyongjiang/KaSA/blob/f85e88c22d0fa4cb8ab2923d7c2bf1bbec152da3/peft/src/peft/tuners/lora/layer.py#L602C21-L602C110
+        # Scaling the last (rank) dimension by lora_diag equals multiplying by diag(lora_diag); unlike an
+        # einsum with fixed "ijk" subscripts, it accepts inputs with any number of leading dimensions.
+        lora_output = lora_B(lora_A(dropout(x)) * lora_diag) * scaling
+        return result + lora_output
+
+
 class QALoraLinearVariant(LoraVariant):
     @staticmethod
     def init(module: Linear, adapter_name: str, **kwargs: Any) -> None:

diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
@@ -159,6 +159,8 @@
         LoraConfig,
         {"target_modules": ["lin0"], "target_parameters": ["lin1.weight"]},
     ),
+    ("Vanilla MLP 7 LoRA with KaSA", "MLP", LoraConfig, {"target_modules": ["lin0"], "use_kasa": True}),
+    ("Vanilla MLP 8 LoRA with KaSA", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"], "use_kasa": True}),
     #######
     # IA³ #
     #######