
Commit 2ca5636

fix formatting

Signed-off-by: Nir David <[email protected]>

1 parent f93f54b commit 2ca5636

File tree

6 files changed, +9 -7 lines changed


docs/source/features/quantization/inc.md

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,7 @@ Intel Gaudi supports quantization of various modules and functions, including, b
 > [!NOTE]
 > Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
 
+<!-- -->
 > [!NOTE]
 > `QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
 > The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
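As a rough illustration of this note, the same `QUANT_CONFIG` variable points at a measurement config during calibration and at a quantization config during inference. A minimal sketch, assuming hypothetical config file paths:

```python
import os

# Calibration pass: QUANT_CONFIG points at a measurement-mode JSON config
# (hypothetical path) so the calibration run can collect per-module statistics.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_measure.json"
# ...run the calibration workload described in vllm-hpu-extension...

# Inference pass: QUANT_CONFIG points at a QUANTIZE-mode JSON config
# (hypothetical path) that consumes the measurements collected above.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_quant.json"
# ...launch vLLM with --quantization inc --kv-cache-dtype fp8_inc...
```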
@@ -27,6 +28,7 @@ vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtyp
 > [!TIP]
 > If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
 
+<!-- -->
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
 > `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
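A small sketch of how these two variables might be combined for a prototyping run (the values shown are illustrative only):

```python
import os

# Illustrative values only: skip the long warmup while prototyping
# (not recommended in production) and allow FP8 compilation up to
# 600 seconds (10 minutes) per engine iteration before timing out.
os.environ["VLLM_SKIP_WARMUP"] = "true"
os.environ["VLLM_ENGINE_ITERATION_TIMEOUT_S"] = "600"
```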
@@ -35,6 +37,7 @@ vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtyp
 ## Run Offline Inference Using FP8
 
 To run offline inference (after completing the model calibration process):
+
 * Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
 * Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
 * Call shutdown method of the model_executor at the end of the run.
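Taken together, the three bullets above correspond to roughly the following offline-inference sketch. The config path is hypothetical, the model name is reused from the serve example earlier in the file, and the exact shutdown attribute chain is an assumption based on vLLM's engine layout rather than something this diff confirms:

```python
import os

from vllm import LLM, SamplingParams

# Assumed path to a QUANTIZE-mode JSON config produced after calibration.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_quant.json"

# Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` to the LLM object.
llm = LLM(
    model="meta-llama/Llama-3.1-405B-Instruct",
    quantization="inc",
    kv_cache_dtype="fp8_inc",
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
for out in outputs:
    print(out.outputs[0].text)

# Call the shutdown method of the model_executor at the end of the run
# (attribute path assumed from vLLM's engine internals).
llm.llm_engine.model_executor.shutdown()
```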

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
@@ -325,7 +325,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.weights_load_device,
             choices=DEVICE_OPTIONS,
             help='Device to which model weights '
-            'will be loaded.')
+            'will be loaded.')
         parser.add_argument(
             '--config-format',
             default=EngineArgs.config_format,

vllm/executor/hpu_executor.py

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@ class HPUExecutor(ExecutorBase):
     def _init_executor(self) -> None:
         """Initialize the worker and load the model."""
         self._init_worker()
-        self.shutdown_inc = True
 
     def _get_worker_kwargs(
         self,

vllm/model_executor/layers/quantization/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -50,13 +50,13 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     from .gptq_marlin import GPTQMarlinConfig
     from .gptq_marlin_24 import GPTQMarlin24Config
     from .hqq_marlin import HQQMarlinConfig
+    from .inc import INCConfig
     from .ipex_quant import IPEXConfig
     from .marlin import MarlinConfig
     from .modelopt import ModelOptFp8Config
     from .neuron_quant import NeuronQuantConfig
     from .qqq import QQQConfig
     from .tpu_int8 import Int8TpuConfig
-    from .inc import INCConfig
 
     method_to_config: Dict[str, Type[QuantizationConfig]] = {
         "aqlm": AQLMConfig,

vllm/worker/hpu_model_runner.py

Lines changed: 3 additions & 1 deletion
@@ -274,7 +274,9 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt):
     return indices, offsets
 
 
-def modify_decoder_layer(module: torch.nn.Module, name="", suffix="DecoderLayer"):
+def modify_decoder_layer(module: torch.nn.Module,
+                         name="",
+                         suffix="DecoderLayer"):
     if module.__class__.__name__.endswith(suffix):
 
         def forward_hook(module, args, output):

vllm/worker/hpu_worker.py

Lines changed: 1 addition & 3 deletions
@@ -402,9 +402,7 @@ def _allocate_kv_cache(
         if device != 'hpu' and self.dtype == torch.float8_e4m3fn:
             dtype = torch.uint8
         for _ in range(self.num_attention_layers):
-            key_cache = torch.zeros(kv_cache_shape,
-                                    dtype=dtype,
-                                    device=device)
+            key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
             value_cache = torch.zeros(kv_cache_shape,
                                       dtype=dtype,
                                       device=device)
