Commit 4a59a80

ENH Orthogonal LoRA layer initialization (2)
Continuation of, and supersedes, huggingface#2389. See the discussion there for further info.
1 parent 4c82bff commit 4a59a80

3 files changed: +71, -4 lines

src/peft/tuners/lora/config.py

Lines changed: 8 additions & 4 deletions
@@ -233,7 +233,7 @@ class LoraConfig(PeftConfig):
             Otherwise, it will use the original default value of `lora_alpha/r`.
         modules_to_save (`List[str]`):
             List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
-        init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]`):
+        init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"]`):
             How to initialize the weights of the adapter layers. Passing True (default) results in the default
             initialization from the reference implementation from Microsoft, with the LoRA B weight being set to 0.
             This means that without further training, the LoRA adapter will be a no-op. Setting the initialization to
@@ -252,7 +252,9 @@ class LoraConfig(PeftConfig):
             a 7B model within seconds, and the training effect is approximately equivalent to using SVD. Passing
             `'corda'` results in the initialization of <a href='https://arxiv.org/abs/2406.05223'>Context-Oriented
             Decomposition Adaptation</a>, which converges even more rapidly than PiSSA in Instruction-Previewed Mode,
-            and preserves world knowledge better than LoRA in Knowledge-Preserved Mode.
+            and preserves world knowledge better than LoRA in Knowledge-Preserved Mode. Passing `"orthogonal"` results
+            in LoRA A and B being initialized orthogonally; in this, it resembles `"olora"`, but the base weights are
+            left untouched (requires `r` to be even, only supported for linear layers for now).
         layers_to_transform (`Union[List[int], int]`):
             The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
             that are specified in this list. If a single integer is passed, it will apply the transformations on the
@@ -356,7 +358,8 @@ class LoraConfig(PeftConfig):
         },
     )
     init_lora_weights: (
-        bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
+        bool
+        | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"]
    ) = field(
        default=True,
        metadata={
@@ -375,7 +378,8 @@ class LoraConfig(PeftConfig):
                "[number of iters] indicates the number of subspace iterations to perform fsvd, and must be a "
                "nonnegative integer. "
                "Passing `'corda'` results in CorDA initialization. "
-                "Pass `'loftq'` to use LoftQ initialization."
+                "Pass `'loftq'` to use LoftQ initialization. "
+                "Pass `'orthogonal'` for orthogonal initialization of LoRA A and B."
            ),
        },
    )
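
For orientation (not part of this commit), a minimal usage sketch of the new option, assuming a hypothetical toy model with a module named "linear", in the spirit of the tests added below:

# Hypothetical example, not from the commit: a toy model with a "linear" module.
import torch
from torch import nn
from peft import LoraConfig, get_peft_model

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(64, 64)

    def forward(self, x):
        return self.linear(x)

# r must be even for "orthogonal"; only linear layers are supported for now.
config = LoraConfig(target_modules=["linear"], init_lora_weights="orthogonal", r=8)
model = get_peft_model(MLP(), config)

lora_A = model.linear.lora_A["default"].weight  # neither factor is zero ...
lora_B = model.linear.lora_B["default"].weight
print((lora_B @ lora_A).abs().max())  # ... but their product is ~0, so the adapter starts as a no-op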

src/peft/tuners/lora/layer.py

Lines changed: 20 additions & 0 deletions
@@ -230,6 +230,9 @@ def update_layer(
             self.loftq_init(adapter_name)
         elif init_lora_weights == "eva":
             nn.init.zeros_(self.lora_B[adapter_name].weight)
+        elif init_lora_weights == "orthogonal":
+            with gather_params_ctx(self.get_base_layer().weight):
+                self.orthogonal_init(adapter_name)
         elif init_lora_weights:
             self.reset_lora_parameters(adapter_name, init_lora_weights)
         # call this before init of the lora variants
@@ -440,6 +443,23 @@ def loftq_init(self, adapter_name):
         self.lora_embedding_B[adapter_name].weight.data = lora_B
         self.get_base_layer().weight.data = qweight

+    @torch.no_grad()
+    def orthogonal_init(self, adapter_name):
+        # https://datta0.github.io/posts/rethink-lora-init/#orthogonal-initialisation
+        rank = self.r[adapter_name]
+        if rank % 2 != 0:
+            raise ValueError(f"Orthogonal initialization requires the LoRA rank to be even, got {rank} instead.")
+
+        X = torch.randn(rank, rank)
+        Q, _ = torch.linalg.qr(X)
+        q_odd = Q[0::2, :]  # every second row, starting at row 0
+        q_even = Q[1::2, :]  # every second row, starting at row 1
+        dtype = self.get_base_layer().weight.dtype
+        lora_A = torch.randn(self.in_features, rank // 2).mm(q_odd).T / 10.0
+        lora_B = torch.randn(rank // 2, self.out_features).T.mm(q_even) / 10.0
+        self.lora_A[adapter_name].weight = nn.Parameter(lora_A.contiguous().to(dtype))
+        self.lora_B[adapter_name].weight = nn.Parameter(lora_B.contiguous().to(dtype))
+
     def _cache_store(self, key: str, value: Any) -> None:
         self._caches[key] = value

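
The construction relies on the rows of an orthogonal matrix being orthonormal: the even-indexed and odd-indexed row blocks of Q are mutually orthogonal, so lora_B @ lora_A collapses to (numerically) zero even though neither factor is zero on its own. A standalone sketch of the same recipe, with illustrative shapes, to make that explicit:

# Standalone illustration of the recipe in orthogonal_init (shapes are arbitrary).
import torch

in_features, out_features, rank = 32, 16, 8

X = torch.randn(rank, rank)
Q, _ = torch.linalg.qr(X)   # Q is orthogonal: its rows are orthonormal
q_odd = Q[0::2, :]          # (rank // 2, rank), every second row starting at 0
q_even = Q[1::2, :]         # (rank // 2, rank), every second row starting at 1

# Random mixtures of the two disjoint row blocks, as in orthogonal_init.
lora_A = torch.randn(in_features, rank // 2).mm(q_odd).T / 10.0    # (rank, in_features)
lora_B = torch.randn(rank // 2, out_features).T.mm(q_even) / 10.0  # (out_features, rank)

# q_even @ q_odd.T == 0 (distinct rows of Q are orthogonal), hence B @ A is zero
# up to floating-point error: the adapter is a no-op before training.
print((lora_B @ lora_A).abs().max())  # well below 1e-6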

tests/test_initialization.py

Lines changed: 43 additions & 0 deletions
@@ -277,6 +277,48 @@ def test_lora_conv2d_false(self):
         # as long as they are not zero, in order to avoid identity transformation.
         assert not torch.allclose(weight_B, torch.zeros_like(weight_B))

+    def test_lora_init_orthogonal(self):
+        torch.manual_seed(0)
+
+        model = self.get_model()
+        config = LoraConfig(target_modules=["linear"], init_lora_weights="orthogonal")
+        model = get_peft_model(model, config)
+
+        weight_A = model.linear.lora_A["default"].weight
+        weight_B = model.linear.lora_B["default"].weight
+
+        assert not torch.allclose(weight_A, torch.zeros_like(weight_A))
+        assert not torch.allclose(weight_B, torch.zeros_like(weight_B))
+        assert (weight_B @ weight_A).abs().max() < 1e-6
+
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_lora_init_orthogonal_half_precision_dtype(self, dtype):
+        try:
+            torch.zeros(1, dtype=dtype)
+        except Exception:
+            pytest.skip(f"dtype {dtype} not supported on this system, skipping test")
+
+        torch.manual_seed(0)
+
+        model = self.get_model()
+        config = LoraConfig(target_modules=["linear"], init_lora_weights="orthogonal")
+        model = get_peft_model(model, config).to(dtype)
+
+        weight_A = model.linear.lora_A["default"].weight
+        weight_B = model.linear.lora_B["default"].weight
+
+        assert weight_A.dtype == dtype
+        assert weight_B.dtype == dtype
+
+    def test_lora_init_orthogonal_odd_rank_raises(self):
+        torch.manual_seed(0)
+
+        model = self.get_model()
+        config = LoraConfig(target_modules=["linear"], init_lora_weights="orthogonal", r=7)
+        msg = "Orthogonal initialization requires the LoRA rank to be even, got 7 instead."
+        with pytest.raises(ValueError, match=msg):
+            get_peft_model(model, config)
+
     def test_lora_scaling_default(self):
         # default is True
         torch.manual_seed(0)
@@ -1254,6 +1296,7 @@ def test_lora_with_bias_embedding_raises(self):
            {"init_lora_weights": "olora"},
            {"init_lora_weights": "pissa"},
            {"init_lora_weights": "pissa_niter_3"},
+            {"init_lora_weights": "orthogonal"},
        ],
    )
    def test_lora_with_bias_incompatible_arguments(self, extra_kwargs):
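
To exercise just the new tests locally, something like `pytest tests/test_initialization.py -k orthogonal` (run from the repository root of a development install) should select the cases added above.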
