
Commit 6e30991

Authored by jiqing-feng, LRL-ModelCloud, Qubitium, ZX-ModelCloud, and stevhliu
FEAT Add gptqmodel support (#2247)
Add support for gptqmodel quantization. This is a replacement for auto-gptq. For now, both packages are supported, but since auto-gptq is no longer being developed, it will be deprecated and removed at some point in the future.

Signed-off-by: jiqing-feng <[email protected]>
Co-authored-by: LRL-ModelCloud <[email protected]>
Co-authored-by: Qubitium-ModelCloud <[email protected]>
Co-authored-by: ZX-ModelCloud <[email protected]>
Co-authored-by: LRL <[email protected]>
Co-authored-by: Steven Liu <[email protected]>
1 parent 1b9bcb2 commit 6e30991

File tree: 11 files changed, +508 -37 lines changed

Makefile (+1)

@@ -34,6 +34,7 @@ tests_core_single_gpu:
 tests_common_gpu:
 	python -m pytest tests/test_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",)
 	python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",)
+	python -m pytest tests/test_gptqmodel.py $(if $(IS_GITHUB_CI),--report-log "gptqmodel_gpu.log",)
 
 tests_examples_multi_gpu_bnb:
 	python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",)

docs/source/developer_guides/quantization.md (+26)

@@ -107,6 +107,32 @@ QLoRA adds trainable weights to all the linear layers in the transformer architecture
 config = LoraConfig(target_modules="all-linear", ...)
 ```
 
+## GPTQ quantization
+
+You can learn more about GPTQ-based 2-, 3-, 4-, and 8-bit quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and in the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) docs. For post-quantization training, PEFT can use either the [GPTQModel](https://github.com/ModelCloud/GPTQModel) or the [AutoGPTQ](https://github.com/autogptq/autogptq) library, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release.
+
+```bash
+# install gptqmodel
+pip install gptqmodel --no-build-isolation
+```
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+gptq_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", tokenizer=tokenizer)
+
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
+
+# save the quantized model
+quantized_model.save_pretrained("./opt-125m-gptq")
+tokenizer.save_pretrained("./opt-125m-gptq")
+```
+
+Once quantized, you can post-train GPTQ models with the PEFT APIs.
+
 ## AQLM quantization
 
 Additive Quantization of Language Models ([AQLM](https://arxiv.org/abs/2401.06118)) is a Large Language Models compression method. It quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes. This allows it to compress models down to as low as 2-bit with considerably low accuracy losses.
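The new section ends by noting that quantized models can be post-trained with the PEFT APIs. A minimal sketch of that step (not part of this diff), assuming the `./opt-125m-gptq` checkpoint saved above and a compatible gptqmodel/optimum install:

```py
# Sketch: attach a LoRA adapter to the GPTQ-quantized checkpoint saved above.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./opt-125m-gptq")

peft_model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM"))
peft_model.print_trainable_parameters()  # only the LoRA parameters are trainable
# peft_model can now be passed to transformers.Trainer or a custom training loop.
```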

src/peft/import_utils.py (+27)

@@ -49,6 +49,33 @@ def is_auto_gptq_available():
             )
 
 
+@lru_cache
+def is_gptqmodel_available():
+    if importlib.util.find_spec("gptqmodel") is not None:
+        GPTQMODEL_MINIMUM_VERSION = packaging.version.parse("1.7.0")
+        OPTIMUM_MINIMUM_VERSION = packaging.version.parse("1.23.99")
+        version_gptqmodel = packaging.version.parse(importlib_metadata.version("gptqmodel"))
+        if GPTQMODEL_MINIMUM_VERSION <= version_gptqmodel:
+            if is_optimum_available():
+                version_optimum = packaging.version.parse(importlib_metadata.version("optimum"))
+                if OPTIMUM_MINIMUM_VERSION <= version_optimum:
+                    return True
+                else:
+                    raise ImportError(
+                        f"gptqmodel requires optimum version {OPTIMUM_MINIMUM_VERSION} or higher. Found version {version_optimum}, "
+                        f"but only versions above {OPTIMUM_MINIMUM_VERSION} are supported"
+                    )
+            else:
+                raise ImportError(
+                    f"gptqmodel requires optimum version {OPTIMUM_MINIMUM_VERSION} or higher to be installed."
+                )
+        else:
+            raise ImportError(
+                f"Found an incompatible version of gptqmodel. Found version {version_gptqmodel}, "
+                f"but only versions above {GPTQMODEL_MINIMUM_VERSION} are supported"
+            )
+
+
 @lru_cache
 def is_optimum_available() -> bool:
     return importlib.util.find_spec("optimum") is not None
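A hedged sketch of how this helper is consumed by the tuner changes below: prefer gptqmodel when a compatible version (plus optimum) is installed, otherwise fall back to auto-gptq. The `backend` variable is purely illustrative.

```py
# Illustrative backend selection, mirroring the tuner changes in this commit.
from peft.import_utils import is_auto_gptq_available, is_gptqmodel_available

if is_gptqmodel_available():  # raises ImportError if gptqmodel/optimum versions are incompatible
    backend = "gptqmodel"
elif is_auto_gptq_available():  # legacy path, slated for deprecation
    backend = "auto-gptq"
else:
    backend = None  # no GPTQ backend installed
print(f"GPTQ backend: {backend}")
```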

src/peft/tuners/adalora/model.py (+11 -5)

@@ -17,14 +17,15 @@
 import torch
 from transformers.pytorch_utils import Conv1D
 
-from peft.import_utils import is_bnb_4bit_available, is_bnb_available
+from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_gptqmodel_available
 from peft.tuners.lora import LoraConfig, LoraModel
 from peft.tuners.tuners_utils import BaseTunerLayer
 from peft.utils import (
     TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING,
     _freeze_adapter,
     _get_submodules,
     get_auto_gptq_quant_linear,
+    get_gptqmodel_quant_linear,
     get_quantization_config,
 )
 from peft.utils.integrations import gather_params_ctx
@@ -135,7 +136,8 @@ def _create_and_replace(
 
         # If it is not an AdaLoraLayer, create a new module, else update it with new adapters
         if not isinstance(target, AdaLoraLayer):
-            new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
+            device_map = self.model.hf_device_map if hasattr(self.model, "hf_device_map") else None
+            new_module = self._create_new_module(lora_config, adapter_name, target, device_map=device_map, **kwargs)
             if adapter_name not in self.active_adapters:
                 # adding an additional adapter: it is not automatically trainable
                 new_module.requires_grad_(False)
@@ -150,7 +152,7 @@ def _create_and_replace(
             )
 
     @staticmethod
-    def _create_new_module(lora_config, adapter_name, target, **kwargs):
+    def _create_new_module(lora_config, adapter_name, target, device_map=None, **kwargs):
         # avoid eager bnb import
         if is_bnb_available():
             import bitsandbytes as bnb
@@ -160,7 +162,11 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs):
             from .bnb import SVDLinear4bit
 
         gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
-        AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config)
+
+        if is_gptqmodel_available():
+            QuantLinear = get_gptqmodel_quant_linear(gptq_quantization_config, device_map=device_map)
+        else:
+            QuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config)
 
         loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
         loaded_in_4bit = kwargs.pop("loaded_in_4bit", False)
@@ -189,7 +195,7 @@ def _create_new_module(lora_config, adapter_name, target, **kwargs):
                 }
             )
             new_module = SVDLinear4bit(target, adapter_name, **fourbit_kwargs)
-        elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear):
+        elif QuantLinear is not None and isinstance(target, QuantLinear):
            new_module = SVDQuantLinear(target, adapter_name, **kwargs)
        else:
            if isinstance(target_base_layer, torch.nn.Linear):

src/peft/tuners/lora/gptq.py (+10 -4)

@@ -16,9 +16,10 @@
 
 import torch
 
+from peft.import_utils import is_gptqmodel_available
 from peft.tuners.lora.layer import LoraLayer
 from peft.tuners.tuners_utils import BaseTunerLayer
-from peft.utils import get_auto_gptq_quant_linear
+from peft.utils import get_auto_gptq_quant_linear, get_gptqmodel_quant_linear
 
 
 class QuantLinear(torch.nn.Module, LoraLayer):
@@ -106,10 +107,15 @@ def dispatch_gptq(
     else:
         target_base_layer = target
 
-    gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
-    AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config)
+    cfg = kwargs.get("gptq_quantization_config", None)
 
-    if AutoGPTQQuantLinear is not None and isinstance(target_base_layer, AutoGPTQQuantLinear):
+    if is_gptqmodel_available():
+        device_map = kwargs.get("device_map", None)
+        quant_linear = get_gptqmodel_quant_linear(cfg, device_map=device_map)
+    else:
+        quant_linear = get_auto_gptq_quant_linear(cfg)
+
+    if quant_linear is not None and isinstance(target_base_layer, quant_linear):
         new_module = QuantLinear(target, adapter_name, **kwargs)
         target.qweight = target_base_layer.qweight
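At the user level, the effect of the updated `dispatch_gptq` can be sketched roughly as follows (assuming the `./opt-125m-gptq` checkpoint from the documentation example above): GPTQ projections matched by the LoRA config are wrapped in PEFT's `QuantLinear`, which keeps `qweight` frozen while adding trainable LoRA weights.

```py
# Sketch: check that a GPTQ layer was wrapped by peft.tuners.lora.gptq.QuantLinear.
from peft import LoraConfig, get_peft_model
from peft.tuners.lora.gptq import QuantLinear
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")
peft_model = get_peft_model(base, LoraConfig(task_type="CAUSAL_LM"))

# For OPT, the default LoRA target modules include q_proj, so dispatch_gptq replaced it.
q_proj = peft_model.base_model.model.model.decoder.layers[0].self_attn.q_proj
print(isinstance(q_proj, QuantLinear), list(q_proj.lora_A.keys()))  # expected: True ['default']
```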

src/peft/tuners/lora/model.py (+2 -1)

@@ -232,7 +232,8 @@ def _create_and_replace(
                 lora_bias=lora_config.lora_bias,
             )
         else:
-            new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs)
+            device_map = self.model.hf_device_map if hasattr(self.model, "hf_device_map") else None
+            new_module = self._create_new_module(lora_config, adapter_name, target, device_map=device_map, **kwargs)
             if adapter_name not in self.active_adapters:
                 # adding an additional adapter: it is not automatically trainable
                 new_module.requires_grad_(False)

src/peft/utils/__init__.py (+2)

@@ -39,6 +39,7 @@
     bloom_model_postprocess_past_key_value,
     cast_mixed_precision_params,
     get_auto_gptq_quant_linear,
+    get_gptqmodel_quant_linear,
     get_quantization_config,
     id_tensor_storage,
     infer_device,
@@ -77,6 +78,7 @@
     "bloom_model_postprocess_past_key_value",
     "cast_mixed_precision_params",
     "get_auto_gptq_quant_linear",
+    "get_gptqmodel_quant_linear",
     "get_peft_model_state_dict",
     "get_quantization_config",
     "id_tensor_storage",

src/peft/utils/other.py (+66 -23)

@@ -30,7 +30,7 @@
 from packaging import version
 from safetensors.torch import storage_ptr, storage_size
 
-from ..import_utils import is_auto_gptq_available, is_torch_tpu_available
+from ..import_utils import is_auto_gptq_available, is_gptqmodel_available, is_torch_tpu_available
 from .constants import (
     CONFIG_NAME,
     EMBEDDING_LAYER_NAMES,
@@ -610,30 +610,73 @@ def get_auto_gptq_quant_linear(gptq_quantization_config):
     """
     Get the right AutoGPTQQuantLinear class based on the quantization config file
     """
-    if gptq_quantization_config is not None and is_auto_gptq_available():
+    if gptq_quantization_config is None:
+        return None
+
+    if is_auto_gptq_available():
         from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
+    else:
+        return None
 
-        desc_act = gptq_quantization_config.desc_act
-        group_size = gptq_quantization_config.group_size
-        bits = gptq_quantization_config.bits
-        if hasattr(gptq_quantization_config, "use_exllama"):
-            use_exllama = gptq_quantization_config.use_exllama
-        else:
-            use_exllama = not gptq_quantization_config.disable_exllama
-        if hasattr(gptq_quantization_config, "exllama_config"):
-            exllama_version = gptq_quantization_config.exllama_config["version"]
-        else:
-            exllama_version = 1
-        AutoGPTQQuantLinear = dynamically_import_QuantLinear(
-            use_triton=False,
-            desc_act=desc_act,
-            group_size=group_size,
-            bits=bits,
-            disable_exllama=not (use_exllama and exllama_version == 1),
-            disable_exllamav2=not (use_exllama and exllama_version == 2),
-        )
-        return AutoGPTQQuantLinear
-    return None
+    desc_act = gptq_quantization_config.desc_act
+    group_size = gptq_quantization_config.group_size
+    bits = gptq_quantization_config.bits
+    if hasattr(gptq_quantization_config, "use_exllama"):
+        use_exllama = gptq_quantization_config.use_exllama
+    else:
+        use_exllama = not gptq_quantization_config.disable_exllama
+    if hasattr(gptq_quantization_config, "exllama_config"):
+        exllama_version = gptq_quantization_config.exllama_config["version"]
+    else:
+        exllama_version = 1
+
+    QuantLinear = dynamically_import_QuantLinear(
+        use_triton=False,
+        desc_act=desc_act,
+        group_size=group_size,
+        bits=bits,
+        disable_exllama=not (use_exllama and exllama_version == 1),
+        disable_exllamav2=not (use_exllama and exllama_version == 2),
+    )
+
+    return QuantLinear
+
+
+def get_gptqmodel_quant_linear(gptq_quantization_config, device_map=None):
+    """
+    Get the right GPTQQuantLinear class based on the quantization config file
+    """
+    if gptq_quantization_config is None:
+        return None
+
+    if not is_gptqmodel_available():
+        return None
+
+    from gptqmodel.utils.importer import hf_select_quant_linear
+
+    desc_act = gptq_quantization_config.desc_act
+    group_size = gptq_quantization_config.group_size
+    bits = gptq_quantization_config.bits
+    checkpoint_format = (
+        gptq_quantization_config.checkpoint_format
+        if hasattr(gptq_quantization_config, "checkpoint_format")
+        else "gptq"
+    )
+    sym = gptq_quantization_config.sym
+    meta = gptq_quantization_config.meta if hasattr(gptq_quantization_config, "meta") else None
+
+    QuantLinear = hf_select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        device_map=device_map,
+        checkpoint_format=checkpoint_format,
+        meta=meta,
+        backend="auto_trainable",
+    )
+
+    return QuantLinear
 
 
 def id_tensor_storage(tensor: torch.Tensor) -> tuple[torch.device, int, int]:
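As a hedged illustration (not in the diff), the new `get_gptqmodel_quant_linear` helper can be called directly with a Transformers `GPTQConfig` to resolve the kernel class that the LoRA/AdaLoRA tuners check targets against; the tuners additionally pass the model's `hf_device_map` as `device_map`.

```py
# Sketch: resolve the gptqmodel quant-linear class for a 4-bit GPTQ config.
from transformers import GPTQConfig

from peft.utils import get_gptqmodel_quant_linear

gptq_config = GPTQConfig(bits=4, group_size=128)  # config of an already-quantized checkpoint
QuantLinear = get_gptqmodel_quant_linear(gptq_config)
print(QuantLinear)  # None when gptqmodel is not installed or no config is given
```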

tests/test_common_gpu.py (+3 -3)

@@ -406,19 +406,19 @@ def test_lora_gptq_quantization_from_pretrained_safetensors(self):
 
         config = LoraConfig(task_type="CAUSAL_LM")
         peft_model = get_peft_model(model, config)
-        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device))
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             peft_model.save_pretrained(tmp_dir)
             model = AutoModelForCausalLM.from_pretrained(**kwargs)
             model = PeftModel.from_pretrained(model, tmp_dir)
             model = prepare_model_for_kbit_training(model)
-            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device))
 
             # loading a 2nd adapter works, #1239
             model.load_adapter(tmp_dir, "adapter2")
             model.set_adapter("adapter2")
-            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device))
 
             # check that both adapters are in the same layer
             assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
