[WIP] Add LoRA multihead attention module #1324
Changes from 59 commits
@@ -247,14 +247,6 @@ def _replace_module(self, parent, child_name, new_module, child):
         if hasattr(child, "base_layer"):
             child = child.base_layer
 
-        if not hasattr(new_module, "base_layer"):
-            if hasattr(new_module, "W_q"):  # HQQ
-                new_module.W_q = child.W_q
-            else:
-                new_module.weight = child.weight
-            if hasattr(child, "bias"):
-                new_module.bias = child.bias
-
         if getattr(child, "state", None) is not None:
             if hasattr(new_module, "base_layer"):
                 new_module.base_layer.state = child.state
@@ -266,15 +258,18 @@ def _replace_module(self, parent, child_name, new_module, child):
         # dispatch to correct device
         for name, module in new_module.named_modules():
             if (self.prefix in name) or ("ranknum" in name):
-                weight = (
-                    child.qweight
-                    if hasattr(child, "qweight")
-                    else child.W_q
-                    if hasattr(child, "W_q")
-                    else child.weight
-                    if hasattr(child, "weight")
-                    else next(child.parameters())
-                )
+                if hasattr(child, "qweight"):
+                    weight = child.qweight
+                elif hasattr(child, "W_q"):
+                    weight = child.W_q
+                elif hasattr(child, "weight"):
+                    weight = child.weight
+                elif getattr(child, "in_proj_weight", None) is not None:  # MHA
+                    weight = child.in_proj_weight
+                elif getattr(child, "q_proj_weight", None) is not None:  # MHA
+                    weight = child.q_proj_weight
+                else:
+                    weight = next(child.parameters())
                 if not any(p.device == meta for p in module.parameters()):
                     module.to(weight.device)
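For context: `nn.MultiheadAttention` packs its query/key/value projections into a single `in_proj_weight` when the embedding dims match, and exposes separate `q_proj_weight`/`k_proj_weight`/`v_proj_weight` otherwise. In each case the unused attribute still exists but is `None`, which is presumably why the new branches check `getattr(..., None) is not None` rather than `hasattr`. A minimal check of that behavior in plain PyTorch, independent of this PR:

```python
# Minimal check of the two nn.MultiheadAttention weight layouts (plain PyTorch).
import torch.nn as nn

packed = nn.MultiheadAttention(embed_dim=16, num_heads=4)
print(packed.in_proj_weight.shape)   # torch.Size([48, 16])
print(packed.q_proj_weight)          # None -> hasattr() would still be True

unpacked = nn.MultiheadAttention(embed_dim=16, num_heads=4, kdim=8, vdim=8)
print(unpacked.in_proj_weight)       # None
print(unpacked.q_proj_weight.shape)  # torch.Size([16, 16])
```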
@@ -360,7 +355,7 @@ def dynamic_dispatch_func(target, adapter_name, lora_config, **kwargs):
         raise ValueError(
             f"Target module {target} is not supported. Currently, only the following modules are supported: "
             "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `torch.nn.Conv3d`, "
-            "`transformers.pytorch_utils.Conv1D`."
+            "`transformers.pytorch_utils.Conv1D`, `torch.nn.MultiheadAttention`."
         )
 
     return new_module
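With the dispatcher extended, targeting an MHA layer by name should work the same way as for the other supported layer types. A rough usage sketch; the model and module names are made up, and it assumes the support added by this PR is present in the installed PEFT version:

```python
# Sketch: assumes this PR's nn.MultiheadAttention support is available.
import torch.nn as nn
from peft import LoraConfig, get_peft_model

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.mha = nn.MultiheadAttention(embed_dim=32, num_heads=4)
        self.head = nn.Linear(32, 2)

    def forward(self, x):
        attn_out, _ = self.mha(x, x, x)
        return self.head(attn_out)

# target the MultiheadAttention module by name, like any other supported layer
config = LoraConfig(r=8, target_modules=["mha"])
peft_model = get_peft_model(ToyModel(), config)
```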
@@ -509,7 +504,13 @@ def _unload_and_optionally_merge(
             except AttributeError:
                 continue
             with onload_layer(target):
-                if hasattr(target, "base_layer"):
+                if hasattr(target, "unload_and_optionally_merge_module"):
+                    # if layers have special unloading method, like MultiheadAttention, use that
+                    unloaded_module = target.unload_and_optionally_merge_module(
+                        merge=merge, safe_merge=safe_merge, adapter_names=adapter_names
+                    )
+                    self._replace_module(parent, target_name, unloaded_module, target)
+                elif hasattr(target, "base_layer"):
                     if merge:
                         target.merge(safe_merge=safe_merge, adapter_names=adapter_names)
                     self._replace_module(parent, target_name, target.get_base_layer(), target)
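The new first branch is a duck-typed escape hatch: a layer that knows how to rebuild its original module (here, the new LoRA `MultiheadAttention`) exposes `unload_and_optionally_merge_module` and returns the module to put back into the parent, bypassing the generic `get_base_layer()` path. A toy illustration of that protocol; names and details are simplified and this is not the PR's actual implementation:

```python
import torch.nn as nn

class ToyAdapterLayer(nn.Module):
    """Stand-in for an adapter layer that needs custom unloading."""

    def __init__(self, base_layer: nn.Module):
        super().__init__()
        self.base_layer = base_layer

    def merge(self, safe_merge=False, adapter_names=None):
        pass  # here the adapter weights would be merged into base_layer

    def unload_and_optionally_merge_module(self, *, merge, safe_merge, adapter_names):
        # the layer itself decides which module gets handed back to the model tree
        if merge:
            self.merge(safe_merge=safe_merge, adapter_names=adapter_names)
        return self.base_layer

parent = nn.Sequential(ToyAdapterLayer(nn.Linear(8, 8)))
target = parent[0]
if hasattr(target, "unload_and_optionally_merge_module"):
    parent[0] = target.unload_and_optionally_merge_module(
        merge=True, safe_merge=False, adapter_names=None
    )
print(type(parent[0]))  # <class 'torch.nn.modules.linear.Linear'>
```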
Why has this been removed?
Sorry, I forgot to put this into the description of the PR.

These lines have been obsolete for some time now. They only apply when we unload the model (otherwise, the `if` does not match). Remember that when we made the `base_layer` switch, we ensured that unloading simply returns the `base_layer`; there is no longer any need to create a new layer (say, a new `nn.Linear` when using `lora.Linear`) and replace the new layer's `weight` with the parent layer's `weight`. The `base_layer` already has the original `weight`, so these lines are unnecessary.

I removed them now because they were getting in the way of `MultiheadAttention`: that layer has no `weight` attribute, so this line would fail.
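
To make that argument concrete, here is a toy version of the invariant the `base_layer` switch guarantees (simplified stand-in classes, not the real PEFT layers): the wrapper keeps the original module untouched, so unloading returns that very object and there is nothing to copy back; and `nn.MultiheadAttention` has no `.weight` to copy in the first place.

```python
import torch.nn as nn

class ToyLoraWrapper(nn.Module):
    """Simplified stand-in for a LoRA layer after the base_layer switch."""

    def __init__(self, base_layer: nn.Module):
        super().__init__()
        self.base_layer = base_layer  # original module kept as-is

    def get_base_layer(self):
        return self.base_layer

base = nn.Linear(4, 4)
wrapped = ToyLoraWrapper(base)
assert wrapped.get_base_layer() is base                  # same object, weights untouched
assert wrapped.get_base_layer().weight is base.weight    # nothing to copy back

mha = nn.MultiheadAttention(embed_dim=8, num_heads=2)
assert not hasattr(mha, "weight")                        # the old copy-back code would fail here
```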