Description
Describe the bug
Following the provided example script, I ran AWQ quantization on a personal SFT version of the Qwen2 model. However, inference with the resulting model fails with both transformers and vLLM.
Expected behavior
The model should be able to load and perform inference normally.
Environment
Include all relevant environment information:
- OS [e.g. Ubuntu 20.04]:
- Python version: 3.10
- LLM Compressor version or commit hash: 0.5.2.dev101+g8b2c612f (built from the main branch)
- ML framework version(s): torch 2.4.0
- Other Python package versions: compressed-tensors 0.10.2a20250613; vLLM 0.6.3+cu124; transformers 4.45.2
- Other relevant environment information: CUDA 12.4
To Reproduce
Exact steps to reproduce the behavior:
Set the model to Qwen/Qwen2-7B-Instruct, then run examples/awq/llama_example.py to produce the quantized model.
- vLLM: loading the model with LLM(quant_model_path) fails.
- transformers: the model loads with AutoModelForCausalLM.from_pretrained(quant_model_path, device_map="cuda:0", torch_dtype="auto"), but the error occurs during generation (see the sketch below).
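For reference, a minimal sketch of the two loading paths described above; quant_model_path is a placeholder for the local output directory of the AWQ run:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

quant_model_path = "/path/to/awq-quantized-qwen2"  # placeholder for the AWQ output dir

# vLLM path: fails while constructing the model with
# "NotImplementedError: No compressed-tensors compatible scheme was found."
llm = LLM(quant_model_path, dtype=torch.float16)

# transformers path: loading succeeds, but generate() raises
# "ValueError: Decompression of packed zero points is currently not supported".
tokenizer = AutoTokenizer.from_pretrained(quant_model_path)
model = AutoModelForCausalLM.from_pretrained(
    quant_model_path, device_map="cuda:0", torch_dtype="auto"
)
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output = model.generate(inputs.input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```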
Errors
Using the vLLM approach, an error occurs when loading the model:
NotImplementedError Traceback (most recent call last)
Cell In[18], line 1
----> 1 quant_model = LLM(
2 quant_model_path,dtype=torch.float16
3 )
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/entrypoints/llm.py:177, in LLM.init(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, mm_processor_kwargs, **kwargs)
152 kwargs["disable_log_stats"] = True
154 engine_args = EngineArgs(
155 model=model,
156 tokenizer=tokenizer,
(...)
175 **kwargs,
176 )
--> 177 self.llm_engine = LLMEngine.from_engine_args(
178 engine_args, usage_context=UsageContext.LLM_CLASS)
179 self.request_counter = Counter()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/engine/llm_engine.py:574, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
572 executor_class = cls._get_executor_cls(engine_config)
573 # Create the LLM engine.
--> 574 engine = cls(
575 **engine_config.to_dict(),
576 executor_class=executor_class,
577 log_stats=not engine_args.disable_log_stats,
578 usage_context=usage_context,
579 stat_loggers=stat_loggers,
580 )
582 return engine
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/engine/llm_engine.py:335, in LLMEngine.init(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers, input_registry, use_cached_outputs)
331 self.input_registry = input_registry
332 self.input_processor = input_registry.create_input_processor(
333 model_config)
--> 335 self.model_executor = executor_class(
336 model_config=model_config,
337 cache_config=cache_config,
338 parallel_config=parallel_config,
339 scheduler_config=scheduler_config,
340 device_config=device_config,
341 lora_config=lora_config,
342 speculative_config=speculative_config,
343 load_config=load_config,
344 prompt_adapter_config=prompt_adapter_config,
345 observability_config=self.observability_config,
346 )
348 if not self.model_config.embedding_mode:
349 self._initialize_kv_caches()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/executor/executor_base.py:47, in ExecutorBase.init(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, prompt_adapter_config, observability_config)
45 self.prompt_adapter_config = prompt_adapter_config
46 self.observability_config = observability_config
---> 47 self._init_executor()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:40, in GPUExecutor._init_executor(self)
38 self.driver_worker = self._create_worker()
39 self.driver_worker.init_device()
---> 40 self.driver_worker.load_model()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/worker/worker.py:183, in Worker.load_model(self)
182 def load_model(self):
--> 183 self.model_runner.load_model()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/worker/model_runner.py:1062, in GPUModelRunnerBase.load_model(self)
1060 logger.info("Starting to load model %s...", self.model_config.model)
1061 with DeviceMemoryProfiler() as m:
-> 1062 self.model = get_model(model_config=self.model_config,
1063 device_config=self.device_config,
1064 load_config=self.load_config,
1065 lora_config=self.lora_config,
1066 parallel_config=self.parallel_config,
1067 scheduler_config=self.scheduler_config,
1068 cache_config=self.cache_config)
1070 self.model_memory_usage = m.consumed_memory
1071 logger.info("Loading model weights took %.4f GB",
1072 self.model_memory_usage / float(2**30))
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/init.py:19, in get_model(model_config, load_config, device_config, parallel_config, scheduler_config, lora_config, cache_config)
13 def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
14 device_config: DeviceConfig, parallel_config: ParallelConfig,
15 scheduler_config: SchedulerConfig,
16 lora_config: Optional[LoRAConfig],
17 cache_config: CacheConfig) -> nn.Module:
18 loader = get_model_loader(load_config)
---> 19 return loader.load_model(model_config=model_config,
20 device_config=device_config,
21 lora_config=lora_config,
22 parallel_config=parallel_config,
23 scheduler_config=scheduler_config,
24 cache_config=cache_config)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:398, in DefaultModelLoader.load_model(self, model_config, device_config, lora_config, parallel_config, scheduler_config, cache_config)
396 with set_default_torch_dtype(model_config.dtype):
397 with target_device:
--> 398 model = _initialize_model(model_config, self.load_config,
399 lora_config, cache_config,
400 scheduler_config)
402 model.load_weights(self._get_all_weights(model_config, model))
404 for _, module in model.named_modules():
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:175, in _initialize_model(model_config, load_config, lora_config, cache_config, scheduler_config)
172 """Initialize a model with the given configurations."""
173 model_class, _ = get_model_architecture(model_config)
--> 175 return build_model(
176 model_class,
177 model_config.hf_config,
178 cache_config=cache_config,
179 quant_config=_get_quantization_config(model_config, load_config),
180 lora_config=lora_config,
181 multimodal_config=model_config.multimodal_config,
182 scheduler_config=scheduler_config,
183 )
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:160, in build_model(model_class, hf_config, cache_config, quant_config, lora_config, multimodal_config, scheduler_config)
150 def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig,
151 cache_config: Optional[CacheConfig],
152 quant_config: Optional[QuantizationConfig], *,
153 lora_config: Optional[LoRAConfig],
154 multimodal_config: Optional[MultiModalConfig],
155 scheduler_config: Optional[SchedulerConfig]) -> nn.Module:
156 extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config,
157 multimodal_config,
158 scheduler_config)
--> 160 return model_class(config=hf_config,
161 cache_config=cache_config,
162 quant_config=quant_config,
163 **extra_kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:393, in Qwen2ForCausalLM.init(self, config, cache_config, quant_config, lora_config)
390 self.lora_config = lora_config
392 self.quant_config = quant_config
--> 393 self.model = Qwen2Model(config, cache_config, quant_config)
395 if config.tie_word_embeddings:
396 self.lm_head = self.model.embed_tokens
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:248, in Qwen2Model.init(self, config, cache_config, quant_config, prefix)
245 else:
246 self.embed_tokens = PPMissingLayer()
--> 248 self.start_layer, self.end_layer, self.layers = make_layers(
249 config.num_hidden_layers,
250 lambda prefix: Qwen2DecoderLayer(config=config,
251 cache_config=cache_config,
252 quant_config=quant_config),
253 prefix=f"{prefix}.layers",
254 )
256 self.make_empty_intermediate_tensors = (
257 make_empty_intermediate_tensors_factory(
258 ["hidden_states", "residual"], config.hidden_size))
259 if get_pp_group().is_last_rank:
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/utils.py:407, in make_layers(num_hidden_layers, layer_fn, prefix)
402 from vllm.distributed.utils import get_pp_indices
403 start_layer, end_layer = get_pp_indices(num_hidden_layers,
404 get_pp_group().rank_in_group,
405 get_pp_group().world_size)
406 modules = torch.nn.ModuleList(
--> 407 [PPMissingLayer() for _ in range(start_layer)] + [
408 maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
409 for idx in range(start_layer, end_layer)
410 ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
411 return start_layer, end_layer, modules
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/utils.py:408, in (.0)
402 from vllm.distributed.utils import get_pp_indices
403 start_layer, end_layer = get_pp_indices(num_hidden_layers,
404 get_pp_group().rank_in_group,
405 get_pp_group().world_size)
406 modules = torch.nn.ModuleList(
407 [PPMissingLayer() for _ in range(start_layer)] + [
--> 408 maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
409 for idx in range(start_layer, end_layer)
410 ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
411 return start_layer, end_layer, modules
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:250, in Qwen2Model.init..(prefix)
245 else:
246 self.embed_tokens = PPMissingLayer()
248 self.start_layer, self.end_layer, self.layers = make_layers(
249 config.num_hidden_layers,
--> 250 lambda prefix: Qwen2DecoderLayer(config=config,
251 cache_config=cache_config,
252 quant_config=quant_config),
253 prefix=f"{prefix}.layers",
254 )
256 self.make_empty_intermediate_tensors = (
257 make_empty_intermediate_tensors_factory(
258 ["hidden_states", "residual"], config.hidden_size))
259 if get_pp_group().is_last_rank:
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:175, in Qwen2DecoderLayer.init(self, config, cache_config, quant_config)
173 rope_theta = getattr(config, "rope_theta", 1000000)
174 rope_scaling = getattr(config, "rope_scaling", None)
--> 175 self.self_attn = Qwen2Attention(
176 hidden_size=self.hidden_size,
177 num_heads=config.num_attention_heads,
178 max_position=config.max_position_embeddings,
179 num_kv_heads=config.num_key_value_heads,
180 rope_theta=rope_theta,
181 cache_config=cache_config,
182 quant_config=quant_config,
183 rope_scaling=rope_scaling)
184 self.mlp = Qwen2MLP(
185 hidden_size=self.hidden_size,
186 intermediate_size=config.intermediate_size,
187 hidden_act=config.hidden_act,
188 quant_config=quant_config,
189 )
190 self.input_layernorm = RMSNorm(config.hidden_size,
191 eps=config.rms_norm_eps)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:118, in Qwen2Attention.init(self, hidden_size, num_heads, num_kv_heads, max_position, rope_theta, cache_config, quant_config, rope_scaling)
115 self.scaling = self.head_dim**-0.5
116 self.rope_theta = rope_theta
--> 118 self.qkv_proj = QKVParallelLinear(
119 hidden_size,
120 self.head_dim,
121 self.total_num_heads,
122 self.total_num_kv_heads,
123 bias=True,
124 quant_config=quant_config,
125 )
126 self.o_proj = RowParallelLinear(
127 self.total_num_heads * self.head_dim,
128 hidden_size,
129 bias=False,
130 quant_config=quant_config,
131 )
133 self.rotary_emb = get_rope(
134 self.head_dim,
135 rotary_dim=self.head_dim,
(...)
138 rope_scaling=rope_scaling,
139 )
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:681, in QKVParallelLinear.init(self, hidden_size, head_size, total_num_heads, total_num_kv_heads, bias, skip_bias_add, params_dtype, quant_config, prefix)
673 output_size = (self.num_heads +
674 2 * self.num_kv_heads) * tp_size * self.head_size
675 self.output_sizes = [
676 self.num_heads * self.head_size * tp_size, # q_proj
677 self.num_kv_heads * self.head_size * tp_size, # k_proj
678 self.num_kv_heads * self.head_size * tp_size, # v_proj
679 ]
--> 681 super().init(input_size=input_size,
682 output_size=output_size,
683 bias=bias,
684 gather_output=False,
685 skip_bias_add=skip_bias_add,
686 params_dtype=params_dtype,
687 quant_config=quant_config,
688 prefix=prefix)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:284, in ColumnParallelLinear.init(self, input_size, output_size, bias, gather_output, skip_bias_add, params_dtype, quant_config, output_sizes, prefix)
274 def init(self,
275 input_size: int,
276 output_size: int,
(...)
282 output_sizes: Optional[List[int]] = None,
283 prefix: str = ""):
--> 284 super().init(input_size, output_size, skip_bias_add, params_dtype,
285 quant_config, prefix)
287 self.gather_output = gather_output
289 # Divide the weight matrix along the last dimension.
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:172, in LinearBase.init(self, input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix)
169 self.quant_method: Optional[
170 QuantizeMethodBase] = UnquantizedLinearMethod()
171 else:
--> 172 self.quant_method = quant_config.get_quant_method(self,
173 prefix=prefix)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py:70, in CompressedTensorsConfig.get_quant_method(self, layer, prefix)
68 return UnquantizedLinearMethod()
69 if isinstance(layer, LinearBase):
---> 70 scheme = self.get_scheme(layer=layer, layer_name=prefix)
71 layer.scheme = scheme
72 return CompressedTensorsLinearMethod(self)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py:315, in CompressedTensorsConfig.get_scheme(self, layer, layer_name)
313 # Find the quant_scheme
314 scheme_dict = self.target_scheme_map[matched_target]
--> 315 scheme = self._get_scheme_from_parts(
316 weight_quant=scheme_dict["weights"],
317 input_quant=scheme_dict["input_activations"])
319 # Raise error if device does not support the scheme
320 # (e.g. fp8 needs ada lovelace)
321 self._check_scheme_supported(scheme.get_min_capability())
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py:279, in CompressedTensorsConfig._get_scheme_from_parts(self, weight_quant, input_quant)
273 if self._is_dynamic_token_w8a8(weight_quant, input_quant):
274 return CompressedTensorsW8A8Int8(
275 strategy=weight_quant.strategy,
276 is_static_input_scheme=False,
277 input_symmetric=input_quant.symmetric)
--> 279 raise NotImplementedError(
280 "No compressed-tensors compatible scheme was found.")
NotImplementedError: No compressed-tensors compatible scheme was found.
Using the Transformers approach, an error occurs during inference:
ValueError Traceback (most recent call last)
Cell In[44], line 4
2 for model_input in tqdm(model_inputs):
3 input_ids = quant_tokenizer(text, return_tensors="pt").input_ids.to(quant_model.device)
----> 4 output = quant_model.generate(input_ids, max_new_tokens=10, eos_token_id=quant_tokenizer.eos_token_id, pad_token_id=quant_tokenizer.pad_token_id)
5 ans = quant_tokenizer.decode(output[0],skip_special_tokens=True).split('\n')[-1]
6 quant_ans.append(ans)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator..decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:2047, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2039 input_ids, model_kwargs = self._expand_inputs_for_generation(
2040 input_ids=input_ids,
2041 expand_size=generation_config.num_return_sequences,
2042 is_encoder_decoder=self.config.is_encoder_decoder,
2043 **model_kwargs,
2044 )
2046 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2047 result = self._sample(
2048 input_ids,
2049 logits_processor=prepared_logits_processor,
2050 stopping_criteria=prepared_stopping_criteria,
2051 generation_config=generation_config,
2052 synced_gpus=synced_gpus,
2053 streamer=streamer,
2054 **model_kwargs,
2055 )
2057 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2058 # 11. prepare beam search scorer
2059 beam_scorer = BeamSearchScorer(
2060 batch_size=batch_size,
2061 num_beams=generation_config.num_beams,
(...)
2066 max_length=generation_config.max_length,
2067 )
File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:3007, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3004 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3006 # forward pass to get next token
-> 3007 outputs = self(**model_inputs, return_dict=True)
3009 if synced_gpus and this_peer_finished:
3010 continue # don't waste resources running the code we don't need
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:1167, in Qwen2ForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, num_logits_to_keep)
1164 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1166 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1167 outputs = self.model(
1168 input_ids=input_ids,
1169 attention_mask=attention_mask,
1170 position_ids=position_ids,
1171 past_key_values=past_key_values,
1172 inputs_embeds=inputs_embeds,
1173 use_cache=use_cache,
1174 output_attentions=output_attentions,
1175 output_hidden_states=output_hidden_states,
1176 return_dict=return_dict,
1177 cache_position=cache_position,
1178 )
1180 hidden_states = outputs[0]
1181 if labels is None and not is_torchdynamo_compiling():
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:976, in Qwen2Model.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
964 layer_outputs = self._gradient_checkpointing_func(
965 decoder_layer.call,
966 hidden_states,
(...)
973 position_embeddings,
974 )
975 else:
--> 976 layer_outputs = decoder_layer(
977 hidden_states,
978 attention_mask=causal_mask,
979 position_ids=position_ids,
980 past_key_value=past_key_values,
981 output_attentions=output_attentions,
982 use_cache=use_cache,
983 cache_position=cache_position,
984 position_embeddings=position_embeddings,
985 )
987 hidden_states = layer_outputs[0]
989 if use_cache:
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:702, in Qwen2DecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)
699 hidden_states = self.input_layernorm(hidden_states)
701 # Self Attention
--> 702 hidden_states, self_attn_weights, present_key_value = self.self_attn(
703 hidden_states=hidden_states,
704 attention_mask=attention_mask,
705 position_ids=position_ids,
706 past_key_value=past_key_value,
707 output_attentions=output_attentions,
708 use_cache=use_cache,
709 cache_position=cache_position,
710 position_embeddings=position_embeddings,
711 )
712 hidden_states = residual + hidden_states
714 # Fully Connected
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:580, in Qwen2SdpaAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings)
569 return super().forward(
570 hidden_states=hidden_states,
571 attention_mask=attention_mask,
(...)
575 use_cache=use_cache,
576 )
578 bsz, q_len, _ = hidden_states.size()
--> 580 query_states = self.q_proj(hidden_states)
581 key_states = self.k_proj(hidden_states)
582 value_states = self.v_proj(hidden_states)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/compressed_tensors/quantization/lifecycle/forward.py:326, in wrap_module_forward_quantized..wrapped_forward(self, *args, **kwargs)
321 self.weight.data = forward_quantize(
322 module, self.weight, "weight", scheme.weights
323 )
325 # perform wrapped forward call
--> 326 output = forward_func_orig.get(module, module.class)(
327 input_, *args[1:], **kwargs
328 )
330 # restore back to unquantized_value
331 if scheme.weights is not None and not compressed:
File ~/.local/lib/python3.10/site-packages/compressed_tensors/linear/compressed_linear.py:103, in CompressedLinear.forward(self, input)
99 """
100 Decompresses the weight, then runs the wrapped forward pass
101 """
102 if self.quantization_status == QuantizationStatus.COMPRESSED:
--> 103 weight_data = self.compressor.decompress_module(self)
104 param = Parameter(weight_data, requires_grad=False)
105 register_offload_parameter(self, "weight", param)
File ~/.local/lib/python3.10/site-packages/compressed_tensors/compressors/base.py:188, in BaseCompressor.decompress_module(self, module)
185 for name, parameter in module.named_parameters():
186 compressed_data[name] = parameter
--> 188 return self.decompress_weight(
189 compressed_data=compressed_data, quantization_args=quantization_args
190 ).to(device)
File ~/.local/lib/python3.10/site-packages/compressed_tensors/compressors/quantized_compressors/pack_quantized.py:174, in PackedQuantizationCompressor.decompress_weight(self, compressed_data, quantization_args)
169 # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
170 if not quantization_args.symmetric and quantization_args.strategy in [
171 QuantizationStrategy.GROUP.value,
172 QuantizationStrategy.CHANNEL.value,
173 ]:
--> 174 raise ValueError(
175 "Decompression of packed zero points is currently not supported"
176 )
177 assert zero_point is not None
178 original_zp_shape = (original_shape[0], scale.shape[-1])
ValueError: Decompression of packed zero points is currently not supported
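For context, this ValueError comes from the guard in compressed-tensors' packed-weight decompressor quoted in the traceback above. A minimal sketch of that condition, filled in with the values from this model's quantization_config (shown under Additional context), makes the mismatch explicit:

```python
# Values copied from the saved quantization_config below; the guard itself is
# paraphrased from pack_quantized.py as quoted in the traceback above.
symmetric = False    # "symmetric": false
strategy = "group"   # "strategy": "group"

# Asymmetric group/channel schemes stored in the pack-quantized format cannot
# currently be decompressed, which is exactly this model's configuration.
if not symmetric and strategy in ("group", "channel"):
    raise ValueError("Decompression of packed zero points is currently not supported")
```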
Additional context
Below is the model config after quantization:
{
  "_name_or_path": "/home/notebook/model_hub/model_only_level_10_12_data_with_shop",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "input_activations": null,
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "actorder": null,
          "block_structure": null,
          "dynamic": false,
          "group_size": 128,
          "num_bits": 4,
          "observer": "minmax",
          "observer_kwargs": {},
          "strategy": "group",
          "symmetric": false,
          "type": "int"
        }
      }
    },
    "format": "pack-quantized",
    "global_compression_ratio": null,
    "ignore": [
      "lm_head"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed"
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 152064
}
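The weights group above is an asymmetric (symmetric: false) 4-bit group scheme stored in the pack-quantized format, which appears to be the combination both backends reject. Below is a small sketch of a check against the saved config.json (the path is a placeholder) that one could run to confirm this before loading:

```python
import json
from pathlib import Path

# Placeholder path to the quantized model directory produced by the AWQ example.
model_dir = Path("/path/to/awq-quantized-qwen2")
cfg = json.loads((model_dir / "config.json").read_text())

qcfg = cfg["quantization_config"]
weights = qcfg["config_groups"]["group_0"]["weights"]

print("format:   ", qcfg["format"])        # pack-quantized
print("strategy: ", weights["strategy"])   # group
print("symmetric:", weights["symmetric"])  # False

# This asymmetric group/channel + pack-quantized combination is what the
# decompression error above rejects, and it also appears to have no matching
# scheme in vLLM's compressed-tensors integration (per the NotImplementedError).
unsupported = (
    qcfg["format"] == "pack-quantized"
    and not weights["symmetric"]
    and weights["strategy"] in ("group", "channel")
)
print("hits unsupported combination:", unsupported)
```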