
[WIP] [V1] TPU support #11936

Open

wants to merge 18 commits into main
4 changes: 2 additions & 2 deletions examples/offline_inference/offline_inference.py
@@ -8,10 +8,10 @@
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
sampling_params = SamplingParams()#temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", max_model_len=512, max_num_seqs=16, enforce_eager=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
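For quick local runs, here is a self-contained sketch of the example as modified above. It assumes only the public vllm entry points (LLM, SamplingParams) and abridges the prompt list, so treat it as illustrative rather than the file's exact contents:

from vllm import LLM, SamplingParams

# Abridged prompt list; the full example file defines a few more prompts.
prompts = [
    "Hello, my name is",
    "The future of AI is",
]

# Default sampling; the explicit temperature/top_p are commented out in this
# WIP change.
sampling_params = SamplingParams()

# Small model and conservative limits to keep memory use modest, with eager
# mode to avoid compilation while the TPU path is still work in progress.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
          max_model_len=512,
          max_num_seqs=16,
          enforce_eager=True)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")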
7 changes: 5 additions & 2 deletions tests/entrypoints/openai/test_accuracy.py
@@ -20,7 +20,7 @@
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58
-DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
+DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests", "--enforce-eager", "--max-num-seqs", "64"]
MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
@@ -61,12 +61,15 @@ def run_test(more_args):
    )

    measured_value = results["results"][TASK][FILTER]
+    print("measured_value = {}".format(measured_value))

    assert (measured_value - RTOL < EXPECTED_VALUE
            and measured_value + RTOL > EXPECTED_VALUE
            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


-@pytest.mark.skipif(not current_platform.is_cuda(),
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run with the V1 Engine."""
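A possible readability tweak for the widened skip condition (a sketch only, not part of this PR; it reuses the current_platform helpers shown above and also updates the reason string to mention TPU):

import pytest

from vllm.platforms import current_platform

V1_SUPPORTED = current_platform.is_cuda() or current_platform.is_tpu()


@pytest.mark.skipif(not V1_SUPPORTED,
                    reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run with the V1 Engine."""
    ...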
4 changes: 4 additions & 0 deletions vllm/attention/selector.py
@@ -163,6 +163,10 @@ def _cached_get_attn_backend(
logger.info("Using Pallas backend.")
from vllm.attention.backends.pallas import PallasAttentionBackend
return PallasAttentionBackend
elif backend == _Backend.PALLAS_VLLM_V1:
logger.info("Using Pallas backend.")
from vllm.v1.attention.backends.pallas import PallasAttentionBackendV1
return PallasAttentionBackendV1
elif backend == _Backend.NO_ATTENTION:
from vllm.attention.backends.placeholder_attn import (
PlaceholderAttentionBackend)
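The new branch mirrors the existing PALLAS case and keeps its import lazy, so torch_xla/Pallas dependencies are only loaded when that backend is actually chosen. A condensed sketch of just this dispatch pattern, with select_pallas_backend as a hypothetical stand-in for the real selector:

from vllm.platforms.interface import _Backend


def select_pallas_backend(backend: _Backend):
    # Keep imports inside the branches so TPU-only dependencies stay optional.
    if backend == _Backend.PALLAS:
        from vllm.attention.backends.pallas import PallasAttentionBackend
        return PallasAttentionBackend
    if backend == _Backend.PALLAS_VLLM_V1:
        from vllm.v1.attention.backends.pallas import PallasAttentionBackendV1
        return PallasAttentionBackendV1
    raise ValueError(f"Unsupported attention backend: {backend}")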
1 change: 1 addition & 0 deletions vllm/platforms/interface.py
@@ -32,6 +32,7 @@ class _Backend(enum.Enum):
    FLASHINFER = enum.auto()
    HPU_ATTN = enum.auto()
    PALLAS = enum.auto()
+    PALLAS_VLLM_V1 = enum.auto()
    IPEX = enum.auto()
    NO_ATTENTION = enum.auto()

11 changes: 9 additions & 2 deletions vllm/platforms/tpu.py
@@ -55,8 +55,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        if compilation_config.level == CompilationLevel.NO_COMPILATION:
            # TPU does not support NO_COMPILATION
            compilation_config.level = CompilationLevel.DYNAMO_ONCE
-        assert compilation_config.level < CompilationLevel.PIECEWISE,\
-            "TPU does not support Inductor."
+        compilation_config.level = 2
+        # assert compilation_config.level < CompilationLevel.PIECEWISE,\
+        #     "TPU does not support Inductor. compilation_config.level = {}".format(compilation_config.level)

        if compilation_config.backend == "":
            compilation_config.backend = "openxla"
@@ -72,3 +73,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
"vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker"
else:
parallel_config.worker_cls = "vllm.worker.tpu_worker.TPUWorker"

@classmethod
def is_pin_memory_available(cls):
# TODO: Verify if it is indeed the case
logger.warning("Pin memory is not supported on TPU.")
return False
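One small note on the first hunk above: in current vLLM, level 2 corresponds to CompilationLevel.DYNAMO_ONCE, so the named constant would be self-documenting where the magic number is used. A sketch, assuming CompilationConfig and CompilationLevel are importable from vllm.config:

from vllm.config import CompilationConfig, CompilationLevel

compilation_config = CompilationConfig()
# Equivalent to `compilation_config.level = 2`, but self-documenting.
compilation_config.level = CompilationLevel.DYNAMO_ONCE
assert compilation_config.level == CompilationLevel.DYNAMO_ONCE == 2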