Description
Describe the bug
Following the provided example script, I ran AWQ quantization on a personal SFT version of the Qwen2 model. However, inference with the resulting model fails with both transformers and vLLM.
Expected behavior
The model should be able to load and perform inference normally.
Environment
Include all relevant environment information:
- OS [e.g. Ubuntu 20.04]:
- Python version: 3.10
- LLM Compressor version or commit hash: 0.5.2.dev101+g8b2c612f (built from the main branch)
- ML framework version(s): torch 2.4.0
- Other Python package versions: compressed-tensors 0.10.2a20250613; vLLM 0.6.3+cu124; transformers 4.45.2
- Other relevant environment information: CUDA 12.4
To Reproduce
Exact steps to reproduce the behavior:
Set the model to Qwen/Qwen2-7B-Instruct, then run examples/awq/llama_example.py to produce the quantized model.
- vLLM: loading the model with LLM(quant_model_path) fails.
- transformers: the model loads with AutoModelForCausalLM.from_pretrained(quant_model_path, device_map="cuda:0", torch_dtype="auto"), but the error occurs during generation (see the sketch below).
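For reference, a minimal sketch of the two loading paths described above; quant_model_path is a placeholder for the local output directory of the AWQ run:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM

quant_model_path = "/path/to/awq-quantized-qwen2"  # placeholder for the AWQ output dir

# vLLM path: fails while constructing the model with
# "NotImplementedError: No compressed-tensors compatible scheme was found."
llm = LLM(quant_model_path, dtype=torch.float16)

# transformers path: loading succeeds, but generate() raises
# "ValueError: Decompression of packed zero points is currently not supported".
tokenizer = AutoTokenizer.from_pretrained(quant_model_path)
model = AutoModelForCausalLM.from_pretrained(
    quant_model_path, device_map="cuda:0", torch_dtype="auto"
)
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output = model.generate(inputs.input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```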
Errors
Using the vLLM approach, an error occurs when loading the model:
NotImplementedError Traceback (most recent call last)
Cell In[18], line 1
----> 1 quant_model = LLM(
2 quant_model_path,dtype=torch.float16
3 )
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/entrypoints/llm.py:177, in LLM.init(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, mm_processor_kwargs, **kwargs)
152 kwargs["disable_log_stats"] = True
154 engine_args = EngineArgs(
155 model=model,
156 tokenizer=tokenizer,
(...)
175 **kwargs,
176 )
--> 177 self.llm_engine = LLMEngine.from_engine_args(
178 engine_args, usage_context=UsageContext.LLM_CLASS)
179 self.request_counter = Counter()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/engine/llm_engine.py:574, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
572 executor_class = cls._get_executor_cls(engine_config)
573 # Create the LLM engine.
--> 574 engine = cls(
575 **engine_config.to_dict(),
576 executor_class=executor_class,
577 log_stats=not engine_args.disable_log_stats,
578 usage_context=usage_context,
579 stat_loggers=stat_loggers,
580 )
582 return engine
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/engine/llm_engine.py:335, in LLMEngine.init(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers, input_registry, use_cached_outputs)
331 self.input_registry = input_registry
332 self.input_processor = input_registry.create_input_processor(
333 model_config)
--> 335 self.model_executor = executor_class(
336 model_config=model_config,
337 cache_config=cache_config,
338 parallel_config=parallel_config,
339 scheduler_config=scheduler_config,
340 device_config=device_config,
341 lora_config=lora_config,
342 speculative_config=speculative_config,
343 load_config=load_config,
344 prompt_adapter_config=prompt_adapter_config,
345 observability_config=self.observability_config,
346 )
348 if not self.model_config.embedding_mode:
349 self._initialize_kv_caches()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/executor/executor_base.py:47, in ExecutorBase.init(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, speculative_config, prompt_adapter_config, observability_config)
45 self.prompt_adapter_config = prompt_adapter_config
46 self.observability_config = observability_config
---> 47 self._init_executor()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:40, in GPUExecutor._init_executor(self)
38 self.driver_worker = self._create_worker()
39 self.driver_worker.init_device()
---> 40 self.driver_worker.load_model()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/worker/worker.py:183, in Worker.load_model(self)
182 def load_model(self):
--> 183 self.model_runner.load_model()
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/worker/model_runner.py:1062, in GPUModelRunnerBase.load_model(self)
1060 logger.info("Starting to load model %s...", self.model_config.model)
1061 with DeviceMemoryProfiler() as m:
-> 1062 self.model = get_model(model_config=self.model_config,
1063 device_config=self.device_config,
1064 load_config=self.load_config,
1065 lora_config=self.lora_config,
1066 parallel_config=self.parallel_config,
1067 scheduler_config=self.scheduler_config,
1068 cache_config=self.cache_config)
1070 self.model_memory_usage = m.consumed_memory
1071 logger.info("Loading model weights took %.4f GB",
1072 self.model_memory_usage / float(2**30))
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/init.py:19, in get_model(model_config, load_config, device_config, parallel_config, scheduler_config, lora_config, cache_config)
13 def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
14 device_config: DeviceConfig, parallel_config: ParallelConfig,
15 scheduler_config: SchedulerConfig,
16 lora_config: Optional[LoRAConfig],
17 cache_config: CacheConfig) -> nn.Module:
18 loader = get_model_loader(load_config)
---> 19 return loader.load_model(model_config=model_config,
20 device_config=device_config,
21 lora_config=lora_config,
22 parallel_config=parallel_config,
23 scheduler_config=scheduler_config,
24 cache_config=cache_config)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:398, in DefaultModelLoader.load_model(self, model_config, device_config, lora_config, parallel_config, scheduler_config, cache_config)
396 with set_default_torch_dtype(model_config.dtype):
397 with target_device:
--> 398 model = _initialize_model(model_config, self.load_config,
399 lora_config, cache_config,
400 scheduler_config)
402 model.load_weights(self._get_all_weights(model_config, model))
404 for _, module in model.named_modules():
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:175, in _initialize_model(model_config, load_config, lora_config, cache_config, scheduler_config)
172 """Initialize a model with the given configurations."""
173 model_class, _ = get_model_architecture(model_config)
--> 175 return build_model(
176 model_class,
177 model_config.hf_config,
178 cache_config=cache_config,
179 quant_config=_get_quantization_config(model_config, load_config),
180 lora_config=lora_config,
181 multimodal_config=model_config.multimodal_config,
182 scheduler_config=scheduler_config,
183 )
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:160, in build_model(model_class, hf_config, cache_config, quant_config, lora_config, multimodal_config, scheduler_config)
150 def build_model(model_class: Type[nn.Module], hf_config: PretrainedConfig,
151 cache_config: Optional[CacheConfig],
152 quant_config: Optional[QuantizationConfig], *,
153 lora_config: Optional[LoRAConfig],
154 multimodal_config: Optional[MultiModalConfig],
155 scheduler_config: Optional[SchedulerConfig]) -> nn.Module:
156 extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config,
157 multimodal_config,
158 scheduler_config)
--> 160 return model_class(config=hf_config,
161 cache_config=cache_config,
162 quant_config=quant_config,
163 **extra_kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:393, in Qwen2ForCausalLM.init(self, config, cache_config, quant_config, lora_config)
390 self.lora_config = lora_config
392 self.quant_config = quant_config
--> 393 self.model = Qwen2Model(config, cache_config, quant_config)
395 if config.tie_word_embeddings:
396 self.lm_head = self.model.embed_tokens
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:248, in Qwen2Model.init(self, config, cache_config, quant_config, prefix)
245 else:
246 self.embed_tokens = PPMissingLayer()
--> 248 self.start_layer, self.end_layer, self.layers = make_layers(
249 config.num_hidden_layers,
250 lambda prefix: Qwen2DecoderLayer(config=config,
251 cache_config=cache_config,
252 quant_config=quant_config),
253 prefix=f"{prefix}.layers",
254 )
256 self.make_empty_intermediate_tensors = (
257 make_empty_intermediate_tensors_factory(
258 ["hidden_states", "residual"], config.hidden_size))
259 if get_pp_group().is_last_rank:
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/utils.py:407, in make_layers(num_hidden_layers, layer_fn, prefix)
402 from vllm.distributed.utils import get_pp_indices
403 start_layer, end_layer = get_pp_indices(num_hidden_layers,
404 get_pp_group().rank_in_group,
405 get_pp_group().world_size)
406 modules = torch.nn.ModuleList(
--> 407 [PPMissingLayer() for _ in range(start_layer)] + [
408 maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
409 for idx in range(start_layer, end_layer)
410 ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
411 return start_layer, end_layer, modules
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/utils.py:408, in (.0)
402 from vllm.distributed.utils import get_pp_indices
403 start_layer, end_layer = get_pp_indices(num_hidden_layers,
404 get_pp_group().rank_in_group,
405 get_pp_group().world_size)
406 modules = torch.nn.ModuleList(
407 [PPMissingLayer() for _ in range(start_layer)] + [
--> 408 maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
409 for idx in range(start_layer, end_layer)
410 ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
411 return start_layer, end_layer, modules
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:250, in Qwen2Model.init..(prefix)
245 else:
246 self.embed_tokens = PPMissingLayer()
248 self.start_layer, self.end_layer, self.layers = make_layers(
249 config.num_hidden_layers,
--> 250 lambda prefix: Qwen2DecoderLayer(config=config,
251 cache_config=cache_config,
252 quant_config=quant_config),
253 prefix=f"{prefix}.layers",
254 )
256 self.make_empty_intermediate_tensors = (
257 make_empty_intermediate_tensors_factory(
258 ["hidden_states", "residual"], config.hidden_size))
259 if get_pp_group().is_last_rank:
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:175, in Qwen2DecoderLayer.init(self, config, cache_config, quant_config)
173 rope_theta = getattr(config, "rope_theta", 1000000)
174 rope_scaling = getattr(config, "rope_scaling", None)
--> 175 self.self_attn = Qwen2Attention(
176 hidden_size=self.hidden_size,
177 num_heads=config.num_attention_heads,
178 max_position=config.max_position_embeddings,
179 num_kv_heads=config.num_key_value_heads,
180 rope_theta=rope_theta,
181 cache_config=cache_config,
182 quant_config=quant_config,
183 rope_scaling=rope_scaling)
184 self.mlp = Qwen2MLP(
185 hidden_size=self.hidden_size,
186 intermediate_size=config.intermediate_size,
187 hidden_act=config.hidden_act,
188 quant_config=quant_config,
189 )
190 self.input_layernorm = RMSNorm(config.hidden_size,
191 eps=config.rms_norm_eps)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.py:118, in Qwen2Attention.init(self, hidden_size, num_heads, num_kv_heads, max_position, rope_theta, cache_config, quant_config, rope_scaling)
115 self.scaling = self.head_dim**-0.5
116 self.rope_theta = rope_theta
--> 118 self.qkv_proj = QKVParallelLinear(
119 hidden_size,
120 self.head_dim,
121 self.total_num_heads,
122 self.total_num_kv_heads,
123 bias=True,
124 quant_config=quant_config,
125 )
126 self.o_proj = RowParallelLinear(
127 self.total_num_heads * self.head_dim,
128 hidden_size,
129 bias=False,
130 quant_config=quant_config,
131 )
133 self.rotary_emb = get_rope(
134 self.head_dim,
135 rotary_dim=self.head_dim,
(...)
138 rope_scaling=rope_scaling,
139 )
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:681, in QKVParallelLinear.init(self, hidden_size, head_size, total_num_heads, total_num_kv_heads, bias, skip_bias_add, params_dtype, quant_config, prefix)
673 output_size = (self.num_heads +
674 2 * self.num_kv_heads) * tp_size * self.head_size
675 self.output_sizes = [
676 self.num_heads * self.head_size * tp_size, # q_proj
677 self.num_kv_heads * self.head_size * tp_size, # k_proj
678 self.num_kv_heads * self.head_size * tp_size, # v_proj
679 ]
--> 681 super().init(input_size=input_size,
682 output_size=output_size,
683 bias=bias,
684 gather_output=False,
685 skip_bias_add=skip_bias_add,
686 params_dtype=params_dtype,
687 quant_config=quant_config,
688 prefix=prefix)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:284, in ColumnParallelLinear.init(self, input_size, output_size, bias, gather_output, skip_bias_add, params_dtype, quant_config, output_sizes, prefix)
274 def init(self,
275 input_size: int,
276 output_size: int,
(...)
282 output_sizes: Optional[List[int]] = None,
283 prefix: str = ""):
--> 284 super().init(input_size, output_size, skip_bias_add, params_dtype,
285 quant_config, prefix)
287 self.gather_output = gather_output
289 # Divide the weight matrix along the last dimension.
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:172, in LinearBase.init(self, input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix)
169 self.quant_method: Optional[
170 QuantizeMethodBase] = UnquantizedLinearMethod()
171 else:
--> 172 self.quant_method = quant_config.get_quant_method(self,
173 prefix=prefix)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py:70, in CompressedTensorsConfig.get_quant_method(self, layer, prefix)
68 return UnquantizedLinearMethod()
69 if isinstance(layer, LinearBase):
---> 70 scheme = self.get_scheme(layer=layer, layer_name=prefix)
71 layer.scheme = scheme
72 return CompressedTensorsLinearMethod(self)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py:315, in CompressedTensorsConfig.get_scheme(self, layer, layer_name)
313 # Find the quant_scheme
314 scheme_dict = self.target_scheme_map[matched_target]
--> 315 scheme = self._get_scheme_from_parts(
316 weight_quant=scheme_dict["weights"],
317 input_quant=scheme_dict["input_activations"])
319 # Raise error if device does not support the scheme
320 # (e.g. fp8 needs ada lovelace)
321 self._check_scheme_supported(scheme.get_min_capability())
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py:279, in CompressedTensorsConfig._get_scheme_from_parts(self, weight_quant, input_quant)
273 if self._is_dynamic_token_w8a8(weight_quant, input_quant):
274 return CompressedTensorsW8A8Int8(
275 strategy=weight_quant.strategy,
276 is_static_input_scheme=False,
277 input_symmetric=input_quant.symmetric)
--> 279 raise NotImplementedError(
280 "No compressed-tensors compatible scheme was found.")
NotImplementedError: No compressed-tensors compatible scheme was found.
Using the Transformers approach, an error occurs during inference:
ValueError Traceback (most recent call last)
Cell In[44], line 4
2 for model_input in tqdm(model_inputs):
3 input_ids = quant_tokenizer(text, return_tensors="pt").input_ids.to(quant_model.device)
----> 4 output = quant_model.generate(input_ids, max_new_tokens=10, eos_token_id=quant_tokenizer.eos_token_id, pad_token_id=quant_tokenizer.pad_token_id)
5 ans = quant_tokenizer.decode(output[0],skip_special_tokens=True).split('\n')[-1]
6 quant_ans.append(ans)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator..decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:2047, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
2039 input_ids, model_kwargs = self._expand_inputs_for_generation(
2040 input_ids=input_ids,
2041 expand_size=generation_config.num_return_sequences,
2042 is_encoder_decoder=self.config.is_encoder_decoder,
2043 **model_kwargs,
2044 )
2046 # 12. run sample (it degenerates to greedy search when generation_config.do_sample=False)
-> 2047 result = self._sample(
2048 input_ids,
2049 logits_processor=prepared_logits_processor,
2050 stopping_criteria=prepared_stopping_criteria,
2051 generation_config=generation_config,
2052 synced_gpus=synced_gpus,
2053 streamer=streamer,
2054 **model_kwargs,
2055 )
2057 elif generation_mode in (GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH):
2058 # 11. prepare beam search scorer
2059 beam_scorer = BeamSearchScorer(
2060 batch_size=batch_size,
2061 num_beams=generation_config.num_beams,
(...)
2066 max_length=generation_config.max_length,
2067 )
File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:3007, in GenerationMixin._sample(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, **model_kwargs)
3004 model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
3006 # forward pass to get next token
-> 3007 outputs = self(**model_inputs, return_dict=True)
3009 if synced_gpus and this_peer_finished:
3010 continue # don't waste resources running the code we don't need
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:1167, in Qwen2ForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, num_logits_to_keep)
1164 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1166 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1167 outputs = self.model(
1168 input_ids=input_ids,
1169 attention_mask=attention_mask,
1170 position_ids=position_ids,
1171 past_key_values=past_key_values,
1172 inputs_embeds=inputs_embeds,
1173 use_cache=use_cache,
1174 output_attentions=output_attentions,
1175 output_hidden_states=output_hidden_states,
1176 return_dict=return_dict,
1177 cache_position=cache_position,
1178 )
1180 hidden_states = outputs[0]
1181 if labels is None and not is_torchdynamo_compiling():
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:976, in Qwen2Model.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
964 layer_outputs = self._gradient_checkpointing_func(
965 decoder_layer.call,
966 hidden_states,
(...)
973 position_embeddings,
974 )
975 else:
--> 976 layer_outputs = decoder_layer(
977 hidden_states,
978 attention_mask=causal_mask,
979 position_ids=position_ids,
980 past_key_value=past_key_values,
981 output_attentions=output_attentions,
982 use_cache=use_cache,
983 cache_position=cache_position,
984 position_embeddings=position_embeddings,
985 )
987 hidden_states = layer_outputs[0]
989 if use_cache:
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:702, in Qwen2DecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings, **kwargs)
699 hidden_states = self.input_layernorm(hidden_states)
701 # Self Attention
--> 702 hidden_states, self_attn_weights, present_key_value = self.self_attn(
703 hidden_states=hidden_states,
704 attention_mask=attention_mask,
705 position_ids=position_ids,
706 past_key_value=past_key_value,
707 output_attentions=output_attentions,
708 use_cache=use_cache,
709 cache_position=cache_position,
710 position_embeddings=position_embeddings,
711 )
712 hidden_states = residual + hidden_states
714 # Fully Connected
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py:580, in Qwen2SdpaAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, position_embeddings)
569 return super().forward(
570 hidden_states=hidden_states,
571 attention_mask=attention_mask,
(...)
575 use_cache=use_cache,
576 )
578 bsz, q_len, _ = hidden_states.size()
--> 580 query_states = self.q_proj(hidden_states)
581 key_states = self.k_proj(hidden_states)
582 value_states = self.v_proj(hidden_states)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/envs/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.local/lib/python3.10/site-packages/compressed_tensors/quantization/lifecycle/forward.py:326, in wrap_module_forward_quantized..wrapped_forward(self, *args, **kwargs)
321 self.weight.data = forward_quantize(
322 module, self.weight, "weight", scheme.weights
323 )
325 # perform wrapped forward call
--> 326 output = forward_func_orig.get(module, module.class)(
327 input_, *args[1:], **kwargs
328 )
330 # restore back to unquantized_value
331 if scheme.weights is not None and not compressed:
File ~/.local/lib/python3.10/site-packages/compressed_tensors/linear/compressed_linear.py:103, in CompressedLinear.forward(self, input)
99 """
100 Decompresses the weight, then runs the wrapped forward pass
101 """
102 if self.quantization_status == QuantizationStatus.COMPRESSED:
--> 103 weight_data = self.compressor.decompress_module(self)
104 param = Parameter(weight_data, requires_grad=False)
105 register_offload_parameter(self, "weight", param)
File ~/.local/lib/python3.10/site-packages/compressed_tensors/compressors/base.py:188, in BaseCompressor.decompress_module(self, module)
185 for name, parameter in module.named_parameters():
186 compressed_data[name] = parameter
--> 188 return self.decompress_weight(
189 compressed_data=compressed_data, quantization_args=quantization_args
190 ).to(device)
File ~/.local/lib/python3.10/site-packages/compressed_tensors/compressors/quantized_compressors/pack_quantized.py:174, in PackedQuantizationCompressor.decompress_weight(self, compressed_data, quantization_args)
169 # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
170 if not quantization_args.symmetric and quantization_args.strategy in [
171 QuantizationStrategy.GROUP.value,
172 QuantizationStrategy.CHANNEL.value,
173 ]:
--> 174 raise ValueError(
175 "Decompression of packed zero points is currently not supported"
176 )
177 assert zero_point is not None
178 original_zp_shape = (original_shape[0], scale.shape[-1])
ValueError: Decompression of packed zero points is currently not supported
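For context, this ValueError comes from the guard in compressed-tensors' packed-weight decompressor quoted in the traceback above. A minimal sketch of that condition, filled in with the values from this model's quantization_config (shown under Additional context), makes the mismatch explicit:

```python
# Values copied from the saved quantization_config below; the guard itself is
# paraphrased from pack_quantized.py as quoted in the traceback above.
symmetric = False    # "symmetric": false
strategy = "group"   # "strategy": "group"

# Asymmetric group/channel schemes stored in the pack-quantized format cannot
# currently be decompressed, which is exactly this model's configuration.
if not symmetric and strategy in ("group", "channel"):
    raise ValueError("Decompression of packed zero points is currently not supported")
```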
Additional context
Below is the model config after quantization:
{
  "_name_or_path": "/home/notebook/model_hub/model_only_level_10_12_data_with_shop",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "input_activations": null,
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "actorder": null,
          "block_structure": null,
          "dynamic": false,
          "group_size": 128,
          "num_bits": 4,
          "observer": "minmax",
          "observer_kwargs": {},
          "strategy": "group",
          "symmetric": false,
          "type": "int"
        }
      }
    },
    "format": "pack-quantized",
    "global_compression_ratio": null,
    "ignore": [
      "lm_head"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed"
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 152064
}
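The weights group above is an asymmetric (symmetric: false) 4-bit group scheme stored in the pack-quantized format, which appears to be the combination both backends reject. Below is a small sketch of a check against the saved config.json (the path is a placeholder) that one could run to confirm this before loading:

```python
import json
from pathlib import Path

# Placeholder path to the quantized model directory produced by the AWQ example.
model_dir = Path("/path/to/awq-quantized-qwen2")
cfg = json.loads((model_dir / "config.json").read_text())

qcfg = cfg["quantization_config"]
weights = qcfg["config_groups"]["group_0"]["weights"]

print("format:   ", qcfg["format"])        # pack-quantized
print("strategy: ", weights["strategy"])   # group
print("symmetric:", weights["symmetric"])  # False

# This asymmetric group/channel + pack-quantized combination is what the
# decompression error above rejects, and it also appears to have no matching
# scheme in vLLM's compressed-tensors integration (per the NotImplementedError).
unsupported = (
    qcfg["format"] == "pack-quantized"
    and not weights["symmetric"]
    and weights["strategy"] in ("group", "channel")
)
print("hits unsupported combination:", unsupported)
```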