
Commit 2ca5636

fix formatting

Signed-off-by: Nir David <[email protected]>

1 parent f93f54b commit 2ca5636

File tree

6 files changed, +9 -7 lines changed


docs/source/features/quantization/inc.md

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,7 @@ Intel Gaudi supports quantization of various modules and functions, including, b
 > [!NOTE]
 > Measurement files are required to run quantized models with vLLM on Gaudi accelerators. The FP8 model calibration procedure is described in the [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package.
 
+<!-- -->
 > [!NOTE]
 > `QUANT_CONFIG` is an environment variable that points to the measurement or quantization [JSON config file](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-json-config-file-options).
 > The measurement configuration file is used during the calibration procedure to collect measurements for a given model. The quantization configuration is used during inference.
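As a rough illustration of this note, the same `QUANT_CONFIG` variable points at a measurement config during calibration and at a quantization config during inference. A minimal sketch, assuming hypothetical config file paths:

```python
import os

# Calibration pass: QUANT_CONFIG points at a measurement-mode JSON config
# (hypothetical path) so the calibration run can collect per-module statistics.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_measure.json"
# ...run the calibration workload described in vllm-hpu-extension...

# Inference pass: QUANT_CONFIG points at a QUANTIZE-mode JSON config
# (hypothetical path) that consumes the measurements collected above.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_quant.json"
# ...launch vLLM with --quantization inc --kv-cache-dtype fp8_inc...
```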
@@ -27,6 +28,7 @@ vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtyp
 > [!TIP]
 > If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
 
+<!-- -->
 > [!TIP]
 > When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
 > `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
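A small sketch of how these two variables might be combined for a prototyping run (the values shown are illustrative only):

```python
import os

# Illustrative values only: skip the long warmup while prototyping
# (not recommended in production) and allow FP8 compilation up to
# 600 seconds (10 minutes) per engine iteration before timing out.
os.environ["VLLM_SKIP_WARMUP"] = "true"
os.environ["VLLM_ENGINE_ITERATION_TIMEOUT_S"] = "600"
```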
@@ -35,6 +37,7 @@ vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtyp
 ## Run Offline Inference Using FP8
 
 To run offline inference (after completing the model calibration process):
+
 * Set the "QUANT_CONFIG" environment variable to point to a JSON configuration file with QUANTIZE mode.
 * Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` as parameters to the `LLM` object.
 * Call shutdown method of the model_executor at the end of the run.
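Taken together, the three bullets above correspond to roughly the following offline-inference sketch. The config path is hypothetical, the model name is reused from the serve example earlier in the file, and the exact shutdown attribute chain is an assumption based on vLLM's engine layout rather than something this diff confirms:

```python
import os

from vllm import LLM, SamplingParams

# Assumed path to a QUANTIZE-mode JSON config produced after calibration.
os.environ["QUANT_CONFIG"] = "/path/to/maxabs_quant.json"

# Pass `quantization=inc` and `kv_cache_dtype=fp8_inc` to the LLM object.
llm = LLM(
    model="meta-llama/Llama-3.1-405B-Instruct",
    quantization="inc",
    kv_cache_dtype="fp8_inc",
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
for out in outputs:
    print(out.outputs[0].text)

# Call the shutdown method of the model_executor at the end of the run
# (attribute path assumed from vLLM's engine internals).
llm.llm_engine.model_executor.shutdown()
```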

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
@@ -325,7 +325,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.weights_load_device,
             choices=DEVICE_OPTIONS,
             help='Device to which model weights '
-            'will be loaded.')
+            'will be loaded.')
         parser.add_argument(
             '--config-format',
             default=EngineArgs.config_format,

vllm/executor/hpu_executor.py

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@ class HPUExecutor(ExecutorBase):
     def _init_executor(self) -> None:
         """Initialize the worker and load the model."""
         self._init_worker()
-        self.shutdown_inc = True
 
     def _get_worker_kwargs(
         self,

vllm/model_executor/layers/quantization/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -50,13 +50,13 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     from .gptq_marlin import GPTQMarlinConfig
     from .gptq_marlin_24 import GPTQMarlin24Config
     from .hqq_marlin import HQQMarlinConfig
+    from .inc import INCConfig
     from .ipex_quant import IPEXConfig
     from .marlin import MarlinConfig
     from .modelopt import ModelOptFp8Config
     from .neuron_quant import NeuronQuantConfig
     from .qqq import QQQConfig
     from .tpu_int8 import Int8TpuConfig
-    from .inc import INCConfig
 
     method_to_config: Dict[str, Type[QuantizationConfig]] = {
         "aqlm": AQLMConfig,

vllm/worker/hpu_model_runner.py

Lines changed: 3 additions & 1 deletion
@@ -274,7 +274,9 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt):
     return indices, offsets
 
 
-def modify_decoder_layer(module: torch.nn.Module, name="", suffix="DecoderLayer"):
+def modify_decoder_layer(module: torch.nn.Module,
+                         name="",
+                         suffix="DecoderLayer"):
     if module.__class__.__name__.endswith(suffix):
 
         def forward_hook(module, args, output):

vllm/worker/hpu_worker.py

Lines changed: 1 addition & 3 deletions
@@ -402,9 +402,7 @@ def _allocate_kv_cache(
         if device != 'hpu' and self.dtype == torch.float8_e4m3fn:
             dtype = torch.uint8
         for _ in range(self.num_attention_layers):
-            key_cache = torch.zeros(kv_cache_shape,
-                                    dtype=dtype,
-                                    device=device)
+            key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
             value_cache = torch.zeros(kv_cache_shape,
                                       dtype=dtype,
                                       device=device)
