Commit 8e83d81

Merge branch 'huggingface:main' into adaption_prompt_edits
2 parents: 0a84e1a + b34d8a2

8 files changed: +141 -12 lines changed


setup.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 from setuptools import find_packages, setup
 
 
-VERSION = "0.14.1.dev0"
+VERSION = "0.15.0"
 
 extras = {}
 extras["quality"] = [
@@ -87,7 +87,7 @@
 )
 
 # Release checklist
-# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.0.dev0" to "0.6.0"
+# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.1.dev0" to "0.7.0"
 # 2. Check if there are any deprecations that need to be addressed for this release by searching for "# TODO" in the code
 # 3. Commit these changes with the message: "Release: VERSION", create a PR and merge it.
 # 4. Add a tag in git to mark the release: "git tag -a VERSION -m 'Adds tag VERSION for pypi' "
@@ -107,4 +107,4 @@
 # twine upload dist/* -r pypi
 # 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory.
 #    Check the notes here: https://docs.google.com/document/d/1k-sOIfykuKjWcOIALqjhFKz4amFEp-myeJUJEzNgjoU/edit?usp=sharing
-# 10. Update the version in __init__.py, setup.py to the bumped minor version + ".dev0" (e.g. from "0.6.0" to "0.7.0.dev0")
+# 10. Update the version in __init__.py, setup.py to the bumped patch version + ".dev0" (e.g. from "0.7.0" to "0.7.1.dev0")

src/peft/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.14.1.dev0"
+__version__ = "0.15.0"
 
 from .auto import (
     MODEL_TYPE_TO_PEFT_MODEL_MAPPING,
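
For anyone verifying the bump locally, a quick sanity check (a minimal sketch; it assumes this release is the version installed in the current environment):

import peft

# Should print "0.15.0" once this commit is part of the installed package.
print(peft.__version__)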

src/peft/tuners/tuners_utils.py

Lines changed: 12 additions & 1 deletion
@@ -452,9 +452,13 @@ def inject_adapter(
         # quite a lot. See: https://github.com/huggingface/diffusers/issues/9297
         # As there is a small chance for undiscovered bugs, we apply this optimization only if the list of
         # target_modules is sufficiently big.
+        # We also exclude IA³ from this optimization. This is because IA³ has both target_modules and
+        # feedforward_modules, which are coupled (the latter must be a subset). It would be possible to change the
+        # logic to keep both in sync, but it's not quite trivial and probably not worth the effort. See #2429.
         if (
             isinstance(peft_config.target_modules, (list, set))
-            and len(peft_config.target_modules) >= MIN_TARGET_MODULES_FOR_OPTIMIZATION
+            and (len(peft_config.target_modules) >= MIN_TARGET_MODULES_FOR_OPTIMIZATION)
+            and (peft_config.peft_type != PeftType.IA3)
         ):
             names_no_target = [
                 name
@@ -469,6 +473,13 @@ def inject_adapter(
             if not key:
                 continue
             # Check for modules_to_save in case
+            #
+            # Note that this is redundant with PeftModel.set_additional_trainable_models but might be necessary
+            # when calling inject_adapter without a PEFT model. This is outdated as it only focuses on
+            # ModulesToSaveWrapper and ignores other potentially configured AuxiliaryTrainingWrapper instances.
+            #
+            # TODO: determine if there's a good reason for this and refactor to support AuxiliaryTrainingWrapper,
+            # or remove if superfluous.
             if _check_for_modules_to_save and any(
                 key.endswith(module_to_save) for module_to_save in peft_config.modules_to_save
             ):
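
To see why IA³ is excluded, here is a minimal sketch of the coupling using plain Python sets; the module names and the subset check below are illustrative stand-ins, not the actual PEFT validation code:

# IA³ requires feedforward_modules to be a subset of target_modules. If the optimization
# rewrote target_modules into a minimal suffix form, a literal subset check could fail after
# a save/load roundtrip, because feedforward_modules would still hold the full names.
target_modules = {f"blocks.{i}.query" for i in range(25)}
feedforward_modules = {f"blocks.{i}.query" for i in range(25)}
assert feedforward_modules.issubset(target_modules)  # holds before the optimization

minimized_target_modules = {"query"}  # roughly what the suffix-based minimization would produce
assert not feedforward_modules.issubset(minimized_target_modules)  # the coupling is now broken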

src/peft/utils/other.py

Lines changed: 10 additions & 0 deletions
@@ -499,6 +499,10 @@ def update(self, adapter_name, **kwargs):
             add_hook_to_module(self.modules_to_save[adapter_name], new_hook)
 
         self.original_module.requires_grad_(False)
+
+        # note that there currently cannot be more than one active adapter for the same layer with modules to save
+        # since there would be no clear way to decide which adapter's weights are the correct ones. therefore we
+        # assume that there is only one active adapter. this precondition is enforced by _set_adapter.
         if adapter_name == self.active_adapter:
             self.modules_to_save[adapter_name].requires_grad_(True)
 
@@ -550,6 +554,10 @@ def adapter_state_dict_load_map(self, adapter_name):
         return {k: f"modules_to_save.{adapter_name}.{k}" for k in self.adapter_state_dict(adapter_name)}
 
     def adapter_state_dict(self, adapter_name):
+        if adapter_name not in self._adapters:
+            # In case of multiple adapters, each bringing their own modules to save, each
+            # ModulesToSaveWrapper will be queried but not every wrapper is obliged to serve the same adapters.
+            return {}
         return self.modules_to_save[adapter_name].state_dict()
 
     def unload_and_optionally_merge_module(
@@ -732,6 +740,7 @@ def _set_trainable(
     found_modules = set()
     # disable removal of duplicates to support targeting tied weights
     key_list = [key for key, _ in model.named_modules(remove_duplicate=False)]
+
     for key in key_list:
         target_module_found = any(key.endswith(target_key) for target_key in module_names)
         if target_module_found:
@@ -776,6 +785,7 @@ def check_adapter_name(adapter_name):
         # if the adapter is found in this module, set it as the active adapter, else disable the adapters of this
         # module
         if adapter_name in module._adapters:
+            module.enable_adapters(True)
             module.set_adapter(adapter_name)
         else:
             module.enable_adapters(False)
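
A minimal sketch of the situation the early return handles, using a hypothetical toy wrapper rather than the real ModulesToSaveWrapper: two wrappers each host a different adapter, yet a model-level save may query every wrapper for every adapter name.

class ToyWrapper:
    """Hypothetical stand-in for ModulesToSaveWrapper holding per-adapter module copies."""

    def __init__(self, adapters):
        self._adapters = set(adapters)
        self.modules_to_save = {name: {"weight": f"copy for {name}"} for name in adapters}

    def adapter_state_dict(self, adapter_name):
        if adapter_name not in self._adapters:
            # this wrapper was never configured for that adapter, so it contributes nothing
            return {}
        return self.modules_to_save[adapter_name]


wrapper_layer_0 = ToyWrapper(["adapter_1"])  # e.g. wraps 0.post_attention_layernorm
wrapper_layer_1 = ToyWrapper(["adapter_2"])  # e.g. wraps 1.post_attention_layernorm

# Without the early return, the toy lookup below would raise a KeyError; with it, the
# wrapper that doesn't know the adapter simply contributes an empty state dict.
assert wrapper_layer_0.adapter_state_dict("adapter_2") == {}
assert wrapper_layer_1.adapter_state_dict("adapter_2") == {"weight": "copy for adapter_2"}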

src/peft/utils/save_and_load.py

Lines changed: 1 addition & 4 deletions
@@ -21,7 +21,6 @@
 import torch
 from huggingface_hub import file_exists, hf_hub_download
 from huggingface_hub.errors import EntryNotFoundError, LocalEntryNotFoundError
-from packaging import version
 from safetensors.torch import load_file as safe_load_file
 
 from peft.mapping import PEFT_TYPE_TO_PREFIX_MAPPING
@@ -468,15 +467,13 @@ def renamed_dora_weights(k):
     return load_result
 
 
+# TODO: remove this function, use vanilla torch.load as soon as torch < 2.6.0 is no longer supported
 def torch_load(*args, weights_only=True, **kwargs):
     """Call torch.load and handle weights_only.
 
     Defaults to weights_only=True to anticipate upcoming switch on the PyTorch side.
 
     """
-    # TODO: weights_only was added in 1.13, remove if 1.12 no longer needs to be supported
-    if version.parse(torch.__version__) < version.parse("1.13"):
-        return torch.load(*args, **kwargs)
     return torch.load(*args, weights_only=weights_only, **kwargs)
 
 
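A brief usage sketch of the simplified helper; it assumes torch_load is imported from peft.utils.save_and_load and the checkpoint filename is a placeholder:

from peft.utils.save_and_load import torch_load

# weights_only defaults to True, matching the direction torch.load is moving in.
state_dict = torch_load("adapter_model.bin", map_location="cpu")

# Callers that really need to unpickle arbitrary objects can still opt out explicitly.
objects = torch_load("adapter_model.bin", map_location="cpu", weights_only=False)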

tests/test_custom_models.py

Lines changed: 4 additions & 3 deletions
@@ -1236,6 +1236,7 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
         outputs_base = model(**X)
         if issubclass(config_cls, (FourierFTConfig, TrainableTokensConfig)):
             config_kwargs = config_kwargs.copy()
+            # override the default value and make PEFT operation a no-op
             config_kwargs["init_weights"] = True
         config = config_cls(
             base_model_name_or_path=model_id,
@@ -1255,9 +1256,9 @@ def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
         model.train()
         # EmbConv1D is slow to learn for some reason
         lr = 0.01 if model_id != "EmbConv1D" else 1.0
-        if isinstance(config_cls, LNTuningConfig):
-            # LayerNorm tuning is slow to learn
-            lr = 1.0
+        if isinstance(config, TrainableTokensConfig):
+            # TrainableTokens is only changing a small subset, so we need a higher lr to see the difference
+            lr = 2.0
         optimizer = torch.optim.SGD(model.parameters(), lr=lr)
 
         # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry

tests/test_other.py

Lines changed: 79 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 
 import pytest
 import torch
@@ -107,6 +108,84 @@ def test_get_peft_model_revision_warning(tmp_path):
     _ = get_peft_model(base_model, lora_config, revision=overwrite_revision)
 
 
+def test_load_multiple_adapters_different_modules_to_save(tmp_path):
+    # This tests the error described in #2422 where loading multiple adapters with different modules_to_save
+    # attributes fails (due to a regression from #2376).
+
+    model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-random-LlamaForCausalLM")
+
+    def peft_config(**kwargs):
+        return LoraConfig(target_modules="all-linear", **kwargs)
+
+    original_model = copy.deepcopy(model)
+
+    peft_config_0 = peft_config(modules_to_save=["0.post_attention_layernorm"])
+    peft_config_1 = peft_config(modules_to_save=["0.post_attention_layernorm"])
+    peft_config_2 = peft_config(modules_to_save=["1.post_attention_layernorm"])
+
+    # Save adapter 0, nothing fancy, should be equal to the base model weights
+    peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_0)
+    peft_model.save_pretrained(tmp_path / "adapter_0")
+
+    # Save adapter 1, modules to save weights are modified randomly, should be unique to adapter 1
+    peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_1)
+    peft_model.model.model.layers[0].post_attention_layernorm.weight.data = torch.rand_like(
+        peft_model.model.model.layers[0].post_attention_layernorm.weight.data
+    )
+    adapter_1_saved = peft_model.model.model.layers[0].post_attention_layernorm.weight.data.clone()
+    peft_model.save_pretrained(tmp_path / "adapter_1")
+
+    # Save adapter 2, modules to save weights are modified randomly, should be unique to adapter 2
+    peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_2)
+    peft_model.model.model.layers[1].post_attention_layernorm.weight.data = torch.rand_like(
+        peft_model.model.model.layers[1].post_attention_layernorm.weight.data
+    )
+    adapter_2_saved = peft_model.model.model.layers[1].post_attention_layernorm.weight.data.clone()
+    peft_model.save_pretrained(tmp_path / "adapter_2")
+
+    del peft_model
+
+    combined_model = PeftModel.from_pretrained(original_model, tmp_path / "adapter_0", adapter_name="adapter_0")
+    combined_model.load_adapter(tmp_path / "adapter_1", adapter_name="adapter_1")
+    combined_model.load_adapter(tmp_path / "adapter_2", adapter_name="adapter_2")
+
+    # For adapter 0 we expect every modules_to_save layer mentioned in this test to be equal to the original model
+    # since we didn't modify it for adapter 0 and only adapter 0 is active.
+    combined_model.set_adapter("adapter_0")
+    assert torch.allclose(
+        combined_model.model.model.layers[0].post_attention_layernorm.weight,
+        original_model.model.layers[0].post_attention_layernorm.weight,
+    )
+    assert torch.allclose(
+        combined_model.model.model.layers[1].post_attention_layernorm.weight,
+        original_model.model.layers[1].post_attention_layernorm.weight,
+    )
+
+    # For adapter 1 we expect the modified modules_to_save layer 0.post_attention_layernorm to be modified; the
+    # other modules_to_save layers mentioned above should be untouched.
+    combined_model.set_adapter("adapter_1")
+    assert torch.allclose(
+        combined_model.model.model.layers[0].post_attention_layernorm.weight,
+        adapter_1_saved,
+    )
+    assert torch.allclose(
+        combined_model.model.model.layers[1].post_attention_layernorm.weight,
+        original_model.model.layers[1].post_attention_layernorm.weight,
+    )
+
+    # For adapter 2 we expect its modules_to_save layer (1.post_attention_layernorm) to be modified but the other
+    # modules_to_save weights should be kept original.
+    combined_model.set_adapter("adapter_2")
+    assert torch.allclose(
+        combined_model.model.model.layers[0].post_attention_layernorm.weight,
+        original_model.model.layers[0].post_attention_layernorm.weight,
+    )
+    assert torch.allclose(
+        combined_model.model.model.layers[1].post_attention_layernorm.weight,
+        adapter_2_saved,
+    )
+
+
 class TestModulesToSaveAttributeAccess:
     """Test attribute access on the ModulesToSaveWrapper class.

tests/test_tuners_utils.py

Lines changed: 31 additions & 0 deletions
@@ -37,6 +37,7 @@
     IA3Config,
     LoHaConfig,
     LoraConfig,
+    PeftModel,
     PromptTuningConfig,
     VeraConfig,
     get_layer_status,
@@ -1502,6 +1503,36 @@ def __init__(self):
         # target modules should *not* be simplified to "query" as that would match "single_transformers_blocks" too
         assert model.peft_config["default"].target_modules != {"query"}
 
+    def test_find_minimal_target_modules_does_not_error_with_ia3(self, tmp_path):
+        # See #2429
+        # There is an issue with the compression of the target_modules attribute when using IA³. There, we
+        # additionally have the feedforward_modules attribute, which must be a subset of target_modules. When
+        # target_modules is shrunk, the subset check will fail. This test ensures that this doesn't happen.
+        n_layers = MIN_TARGET_MODULES_FOR_OPTIMIZATION + 1
+
+        class InnerModule(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.query = nn.Linear(10, 10)
+
+        class OuterModule(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.blocks = nn.ModuleList([InnerModule() for _ in range(n_layers)])
+
+        target_modules = [f"blocks.{i}.query" for i in range(n_layers)]
+        feedforward_modules = [f"blocks.{i}.query" for i in range(n_layers)]
+        # the subset check happens here
+        config = IA3Config(target_modules=target_modules, feedforward_modules=feedforward_modules)
+        # the optimization step happens here, after the subset check, so at first we're fine, but we will run
+        # into an issue after a save/load roundtrip
+        model = get_peft_model(OuterModule(), config)
+        model.save_pretrained(tmp_path)
+        del model
+
+        # does not raise
+        PeftModel.from_pretrained(OuterModule(), tmp_path)
+
 
 class TestRankAndAlphaPattern:
     @pytest.fixture
