From 85c50e26dd4abb537cf6f8a67e3e810c6403039a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 07:48:22 -0500 Subject: [PATCH 001/293] Squash commit of all changes from v1_logprobs Signed-off-by: Andrew Feldman --- tests/v1/samplers/__init__.py | 0 tests/v1/samplers/test_logprobs.py | 340 +++++++++++++++++++ vllm/outputs.py | 16 +- vllm/transformers_utils/detokenizer_utils.py | 51 ++- vllm/v1/core/scheduler.py | 152 ++++++++- vllm/v1/engine/__init__.py | 9 + vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/detokenizer.py | 60 +++- vllm/v1/engine/llm_engine.py | 10 +- vllm/v1/engine/processor.py | 28 +- vllm/v1/outputs.py | 8 +- vllm/v1/request.py | 8 +- vllm/v1/sample/metadata.py | 8 +- vllm/v1/sample/sampler.py | 161 +++++++-- vllm/v1/worker/gpu_model_runner.py | 107 ++++-- 15 files changed, 885 insertions(+), 76 deletions(-) create mode 100644 tests/v1/samplers/__init__.py create mode 100644 tests/v1/samplers/test_logprobs.py diff --git a/tests/v1/samplers/__init__.py b/tests/v1/samplers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py new file mode 100644 index 0000000000000..114ce7bd1f2fb --- /dev/null +++ b/tests/v1/samplers/test_logprobs.py @@ -0,0 +1,340 @@ +from typing import List, Tuple + +import pytest +import torch + +from tests.kernels.utils import override_backend_env_variable +from vllm import SamplingParams + +from ...conftest import VllmRunner + +MODELS = ["facebook/opt-125m"] + + +def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: + """Generate logprobs configs for a batch of requests + + A given request's logprobs configuration is (1) num_sample_logprobs and (2) + num_prompt_logprobs. The batch logprobs configuration is the list of request + logprobs configs. 
+ + batch_logprobs_composition == "NONE" yields a batch with no sample or prompt + logprobs + + batch_logprobs_composition == "SAMPLE" yields a batch with some requests + configured for sample logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "PROMPT" yields a batch with some requests + configured for prompt logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "SAMPLE_PROMPT" yields a batch with some + requests configured for sample logprobs and prompt logprobs, some configured + for only sample logprobs or only prompt logprobs, and some configured for + no logprobs + + Args: + + batch_logprobs_composition: types of logprobs configs to include in batch + + Returns: + + List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + tuples + + """ + if batch_logprobs_composition == "NONE": + # No requests with sample or prompt logprobs + return [(None, None), (0, None), (None, 0), (0, 0)] + elif batch_logprobs_composition == "SAMPLE": + return [ + (None, None), + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + ] + elif batch_logprobs_composition == "PROMPT": + return [ + (None, 0), + (0, None), + (0, 0), + (None, 6), + (0, 5), + ] + elif batch_logprobs_composition == "SAMPLE_PROMPT": + return [ + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + (6, 3), + (None, 6), + (0, 5), + ] + else: + raise ValueError("Invalid logprobs batch configuration for test.") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +def test_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +): + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. 
+ + Args: + hf_runner + vllm_runner + model + dtype + detokenize: if False, return generated tokens bypassing detokenizer + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + detokenize = True + + test_prompts = example_prompts + + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + max_num_seqs = 128 + max_num_batched_tokens = 128 + max_model_len = 128 + + max_tokens = 5 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy( + test_prompts, + max_tokens=max_tokens, + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = _get_test_batch(batch_logprobs_composition) + + # We rely on there being more prompts than combinations of + # logprobs & prompt logprobs which we want to test + assert len(test_prompts) >= len(logprob_prompt_logprob_list) + # Make sure there is a sample params for each prompt + num_extra_params = len(test_prompts) - len(logprob_prompt_logprob_list) + if num_extra_params > 0: + logprob_prompt_logprob_list = ( + logprob_prompt_logprob_list + + logprob_prompt_logprob_list[-num_extra_params:]) + # Now the number of prompts should match the number of sample params combos + assert len(test_prompts) == len(logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=lp, + prompt_logprobs=plp, + temperature=0.0, + detokenize=detokenize) + for lp, plp in logprob_prompt_logprob_list + ] + + with vllm_runner( + model, + dtype=dtype, + max_logprobs=7, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + enforce_eager=True, + ) as vllm_model: + vllm_results = vllm_model.model.generate( + test_prompts, sampling_params=vllm_sampling_params) + + for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( + vllm_results, hf_logprobs, hf_outputs, + logprob_prompt_logprob_list): + + # Extract request-level (prompt)logprobs config + num_top_logprobs = logprob_prompt_logprob[0] + num_top_prompt_logprobs = logprob_prompt_logprob[1] + + # Test whether sampled token output is consistent between vLLM and HF + # vLLM prompt+completion should match HF output + assert (vllm_result.prompt_token_ids + + vllm_result.outputs[0].token_ids == hf_output[0]) + + # Validate sample logprobs + if num_top_logprobs is not None and num_top_logprobs > 0: + assert num_top_logprobs is not None + # Confirm that the structure of the sample logprobs in the result is + # correct + assert vllm_result.outputs[0].logprobs is not None + assert len(vllm_result.outputs[0].logprobs) == max_tokens + for logprobs in vllm_result.outputs[0].logprobs: + assert logprobs is not None + # If the output token is not included in the top X + # logprob, it can return 1 more data + assert (len(logprobs) == num_top_logprobs + or len(logprobs) == num_top_logprobs + 1) + output_text = vllm_result.outputs[0].text + output_string_from_most_likely_tokens_lst: List[str] = [] + for top_logprobs in vllm_result.outputs[0].logprobs: + top_logprob = next(iter(top_logprobs.values())) + output_string_from_most_likely_tokens_lst.append( + top_logprob.decoded_token) + + if detokenize: + output_string_from_most_likely_tokens = "".join( + output_string_from_most_likely_tokens_lst) + assert output_text == 
output_string_from_most_likely_tokens, ( + "The output text from the top logprob for each token " + "position should be the same as the output text in the " + "result.") + else: + assert output_text == '' + assert output_string_from_most_likely_tokens_lst == ( + [None] * max_tokens) + + # Compare vLLM sample logprobs to HF + vllm_sample_logprobs = vllm_result.outputs[0].logprobs + for i, top_logprobs in enumerate(vllm_sample_logprobs): + for token_id, sample_logprob in top_logprobs.items(): + logprob = sample_logprob.logprob + torch.testing.assert_close( + logprob, + hf_logprob[i][-1][token_id].item(), + atol=1e-2, + rtol=1e-2) + if detokenize: + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is" + " returned to the user.") + else: + # Logprobs disabled for this request; should be None + assert vllm_result.outputs[0].logprobs is None + + # Validate prompt logprobs + if (num_top_prompt_logprobs is not None + and num_top_prompt_logprobs > 0): + # Confirm that structure of prompt logprobs in result is correct + assert vllm_result.prompt_logprobs is not None + # - The first prompt logprob is always None + assert vllm_result.prompt_logprobs[0] is None + # - Prompt logprobs are returned for all indices in + # the prompt + assert len(vllm_result.prompt_logprobs) == len( + vllm_result.prompt_token_ids) + for prompt_logprobs in vllm_result.prompt_logprobs[1:]: + assert prompt_logprobs is not None + # - If the prompt token is not included in the top X + # logprob, it can return 1 more data + assert (len(prompt_logprobs) == num_top_prompt_logprobs + or len(prompt_logprobs) == num_top_prompt_logprobs + 1) + + # Compare prompt logprobs to HF + # The first prompt logprob is always None, so we compare it from + # 1:. 
+ vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] + for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): + for token_id, logprob in vllm_prompt_logprob_dict.items(): + torch.testing.assert_close( + logprob.logprob, + hf_logprob[0][i][token_id].item(), + atol=1e-2, + rtol=1e-2) + else: + assert vllm_result.prompt_logprobs is None + + +def test_max_logprobs(monkeypatch): + """vLLM v1 engine should fail a request with `logprobs > max_logprobs` + + Should also fail for `prompt_logprobs > max_logprobs` + + Args: + monkeypatch + """ + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + runner = VllmRunner("facebook/opt-125m", max_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("detokenize", [True, False]) +def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, + monkeypatch): + """Engine should return `logprobs` and `prompt_logprobs` as `None` + + Args: + vllm_runner + model + detokenize: whether to feed generated tokens to detokenizer + example_prompts + monkeypatch + """ + + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + max_num_seqs = 256 + max_num_batched_tokens = None + max_tokens = 5 + + with vllm_runner( + model, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + detokenize=detokenize) + results_logprobs_none = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_none) + + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None diff --git a/vllm/outputs.py b/vllm/outputs.py index 2d256803edfe8..9733158504945 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -127,24 +127,24 @@ def new( prompt_token_ids: Optional[List[int]], text: str, token_ids: List[int], + logprobs: Optional[SampleLogprobs], + prompt_logprobs: Optional[PromptLogprobs], finished: bool = False, ) -> "RequestOutput": """Initialize a new RequestOutput object.""" # TODO: Support `n` > 1. 
- completion_output = CompletionOutput( - index=0, - text=text, - token_ids=token_ids, - cumulative_logprob=None, - logprobs=None, # TODO - ) + completion_output = CompletionOutput(index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=None, + logprobs=logprobs) return RequestOutput( request_id=request_id, prompt=prompt, prompt_token_ids=prompt_token_ids, - prompt_logprobs=None, # TODO + prompt_logprobs=prompt_logprobs, outputs=[completion_output], finished=finished, ) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 37ff8a236e791..885e3b9d92f88 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,4 +1,6 @@ -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple + +from vllm.sequence import Logprob from .tokenizer import AnyTokenizer @@ -165,3 +167,50 @@ def detokenize_incrementally( new_text = new_text[len(prefix_text):] return new_tokens, new_text, read_offset, len(output_tokens) + + +def detokenize_logprob_incrementally_in_place( + tokenizer: AnyTokenizer, + logprob_dict: Dict[int, Logprob], + input_ids_prefix: List[int], + prev_tokens: Optional[List[str]], + prefix_offset: int, + read_offset: int, + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = True, +) -> None: + """Detokenizes the logprobs at a single token offset incrementally. + + For each top-token in `logprob_dict`, apply incremental detokenization + to the token list `input_ids_prefix + [top-token id]` + + The logprob data structure is modified in-place with the string + representation of each decoded top-token. + + Args: + tokenizer: The tokenizer to use. + logprob_dict: logprob data structure for a single token position + input_ids_prefix: The input ids *preceding* the token offset under + consideration + prev_tokens: The previous tokens. If None, this function will convert + the input ids to tokens and return the tokens and the new text. + prefix_offset: The prefix offset. + read_offset: The read offset. + skip_special_tokens: Whether to skip special tokens. + spaces_between_special_tokens: Whether to add spaces between special + tokens. 
+ """ + + for token_id in logprob_dict: + # Detokenize logprob for a particular top + # token at a particular token offset + + logprob_dict[token_id].decoded_token = detokenize_incrementally( + tokenizer=tokenizer, + all_input_ids=input_ids_prefix + [token_id], + prev_tokens=prev_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + )[1] diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ba50a9786d805..476b12c705482 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -6,6 +6,7 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.sequence import Logprob from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput @@ -247,6 +248,13 @@ def schedule(self) -> "SchedulerOutput": self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget + # Now that requests are scheduled, generate a mask indicating which + # request is partial + partial_running_reqs = [ + (req.num_computed_tokens + num_scheduled_tokens[req.request_id] < + req.num_tokens) for req in self.running + ] + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -277,6 +285,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, + partial_running_reqs=partial_running_reqs, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -384,11 +393,85 @@ def update_from_output( # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + if do_logprobs: + assert model_runner_output.logprob_token_ids_cpu is not None + logprob_token_ids_list = ( + model_runner_output.logprob_token_ids_cpu.tolist()) + logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + if do_prompt_logprobs: + assert model_runner_output.prompt_logprob_token_ids_cpu is not None + prompt_logprob_token_ids_list = ( + model_runner_output.prompt_logprob_token_ids_cpu.tolist()) + prompt_logprob_values_list = ( + model_runner_output.prompt_logprobs_cpu.tolist()) + curr_prompt_base_idx = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] + req_index = model_runner_output.req_id_to_index[req_id] + num_new_tokens = 1 + max_logprobs = request.max_logprobs + request_do_logprobs = (do_logprobs and max_logprobs is not None + and max_logprobs > 0) + + if do_prompt_logprobs: + max_prompt_logprobs = request.max_prompt_logprobs + num_new_prompt_tokens = ( + num_scheduled_tokens[request.request_id] - + int(not scheduler_output.partial_running_reqs[req_index])) + + request_do_prompt_logprobs = (max_prompt_logprobs is not None + and max_prompt_logprobs > 0 + and num_new_prompt_tokens > 0) + + if request_do_prompt_logprobs: + + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. + # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. + + slice_upper_index = (curr_prompt_base_idx + + num_new_prompt_tokens) + prompt_logprob_token_ids = prompt_logprob_token_ids_list[ + curr_prompt_base_idx:slice_upper_index] + prompt_logprob_values = prompt_logprob_values_list[ + curr_prompt_base_idx:slice_upper_index] + curr_prompt_base_idx = slice_upper_index + + logprob_cnt = max_prompt_logprobs + prompt_logprobs = [{ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(plp_tok_values[0:logprob_cnt], + plp_tok_token_ids[0:logprob_cnt])) + } for plp_tok_values, plp_tok_token_ids in zip( + prompt_logprob_values, prompt_logprob_token_ids)] + + if not request.prompt_logprobs: + # Ensure that None is the first prompt logprob + prompt_logprobs = [None] + prompt_logprobs + + curr_prompt_base_idx = slice_upper_index + + prompt_slice_range_upper = request.num_computed_tokens + prompt_slice_range_lower = (prompt_slice_range_upper - + num_new_prompt_tokens) + request.prompt_logprobs.extend(prompt_logprobs) + else: + curr_prompt_base_idx += num_new_prompt_tokens + else: + request_do_prompt_logprobs = False + # When the request's num_computed_tokens catches up its num_tokens, # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. @@ -405,12 +488,45 @@ def update_from_output( self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: - req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request # generates at most one token at each step. 
token_id = sampled_token_ids[req_index] + if request_do_logprobs: + # Construct logprobs, if requested (TODO: assumes one + # generated token). + logprob_token_ids = logprob_token_ids_list[req_index] + logprob_values = logprob_values_list[req_index] + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:max_logprobs]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = ( + [float('-inf')] * + (len(logprob_values) - 1 - max_logprobs)) + + indices = sorted(range(len(logprob_values)), + key=lambda k: logprob_values[k], + reverse=True) + logprob_values = [logprob_values[i] for i in indices] + logprob_token_ids = [ + logprob_token_ids[i] for i in indices + ] + + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_values = logprob_values[0:logprob_cnt] + logprob_token_ids = logprob_token_ids[0:logprob_cnt] + + request.logprobs.append({ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(logprob_values, logprob_token_ids)) + }) request.append_output_token_ids(token_id) - num_new_tokens = 1 # TODO: Update the KV cache manager for prefix caching. # Check for stop and update request state. @@ -418,18 +534,47 @@ def update_from_output( stopped = self._check_stop(request) # Add EngineCoreOutput for this Request. + # Return the logprob for the most recently computed tokens. + # Return no prompt logprobs in decode-phase. output = EngineCoreOutput( request_id=req_id, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason) + stop_reason=request.stop_reason, + logprobs=(request.logprobs[-num_new_tokens:] + if request_do_logprobs else None), + prompt_logprobs=(prompt_logprobs + if request_do_prompt_logprobs else None), + prompt_logprobs_token_ids=(request.prompt_token_ids + if request_do_prompt_logprobs + else None)) engine_core_outputs.append(output) # Breakout of the loop. 
if stopped: continue + elif request_do_prompt_logprobs: + # This request is still partial but prompt logprobs were + # requested + engine_core_outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=[] if request_do_logprobs else None, + prompt_logprobs=( + prompt_logprobs if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)), + prompt_logprobs_token_ids=( + request.prompt_token_ids[prompt_slice_range_lower: + prompt_slice_range_upper] + if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)))) + new_running.append(request) self.running = new_running return engine_core_outputs @@ -581,6 +726,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] + partial_running_reqs: List[bool] # True if running req is partial num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 967124fd850ea..46ee3154d69c0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -7,6 +7,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sequence import PromptLogprobs, SampleLogprobs @dataclass @@ -22,6 +23,11 @@ class DetokenizerRequest: stop: List[str] include_stop_str_in_output: bool + # Per-request logprobs & prompt logprobs + # counts; None is equivalent to 0 + logprobs: Optional[int] + prompt_logprobs: Optional[int] + @dataclass class EngineCoreRequest: @@ -52,6 +58,9 @@ class EngineCoreOutput(msgspec.Struct, request_id: str new_token_ids: List[int] finished: bool + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + prompt_logprobs_token_ids: Optional[List[int]] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a17c8eac4b77c..421ecc8c0d921 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -144,7 +144,8 @@ async def add_request( # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, + request_id, prompt, params, arrival_time, + (await self.get_model_config()).max_logprobs, lora_request, trace_headers, prompt_adapter_request, priority) # 3) Add the request to Detokenizer (this process). 
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6249d60199a62..5ad8b8c725f3e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,21 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind +from vllm.sequence import PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, + detokenize_logprob_incrementally_in_place) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput logger = init_logger(__name__) +AnyLogprobs = Union[Optional[SampleLogprobs], Optional[PromptLogprobs]] + @dataclass class IncrementalDetokenizer: @@ -20,6 +24,8 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] # Stop strings stop: List[str] @@ -72,6 +78,11 @@ def from_new_request( else: stop_buffer_length = 0 + # Logprobs & prompt logprobs settings + do_logprobs = request.logprobs is not None and request.logprobs > 0 + do_prompt_logprobs = (request.prompt_logprobs is not None + and request.prompt_logprobs > 0) + return cls( output_text="", tokens=tokens, @@ -91,25 +102,34 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - ) + logprobs=[] if do_logprobs else None, + prompt_logprobs=[] if do_prompt_logprobs else None) def add_tokens( self, new_token_ids: List[int], + new_logprobs: Optional[SampleLogprobs], + new_prompt_logprobs: Optional[PromptLogprobs], finish_reason: Optional[str], stop_reason: Optional[str], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. + 1a) If necessary, detokenize logprobs incrementally + 1b) If necessary, detokenize prompt logprobs incrementally 2) Update the RequestOutput with the new text. """ - # 1) Detokenize the new token ids incrementally. + do_logprobs = new_logprobs is not None and len(new_logprobs) > 0 + assert not do_logprobs or len(new_logprobs) == len(new_token_ids) + + # 1) Detokenize the new token ids incrementally. If necessary, + # detokenize logprobs. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. 
decoded_text = "" - for new_token_id in new_token_ids: + for tdx, new_token_id in enumerate(new_token_ids): self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -123,6 +143,23 @@ def add_tokens( spaces_between_special_tokens, ) + if do_logprobs: + # Detokenize individual token logprobs in-place + logprob_dict = new_logprobs[tdx] + assert logprob_dict is not None + detokenize_logprob_incrementally_in_place( + tokenizer=self.tokenizer, + logprob_dict=logprob_dict, + input_ids_prefix=self.token_ids[0:-1], + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + self.logprobs.append(logprob_dict) + self.tokens.extend(new_tokens) self.prefix_offset = prefix_offset self.read_offset = read_offset @@ -130,6 +167,10 @@ def add_tokens( decoded_text += new_decoded_token_text + # 1b) If necessary, detokenize prompt logprobs incrementally + if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: + self.prompt_logprobs.extend(new_prompt_logprobs) + # 2) Evaluate stop criteria. if self.stop: stop = StopChecker.check_stop_strings( @@ -139,11 +180,10 @@ def add_tokens( include_in_output=self.include_stop_str_in_output, ) if stop is not None: - stop_str, truncate_to = stop + _, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] finish_reason = "stop" # TODO: use constant - stop_reason = stop_str # TODO: handle stop_token_ids here too? @@ -156,6 +196,8 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids + logprobs = new_logprobs if delta else self.logprobs + prompt_logprobs = new_prompt_logprobs if delta else self.prompt_logprobs request_output = RequestOutput.new( self.request_id, @@ -163,6 +205,8 @@ def add_tokens( self.prompt_token_ids, output_text, token_ids, + logprobs, + prompt_logprobs, finished, ) @@ -254,6 +298,8 @@ def step( # Detokenize and update state. request_output = detokenizer.add_tokens( new_token_ids=engine_core_output.new_token_ids, + new_logprobs=engine_core_output.logprobs, + new_prompt_logprobs=engine_core_output.prompt_logprobs, finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bd19d998a4adb..b93634230529e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -134,8 +134,9 @@ def add_request( # 1) Process raw inputs into the request. detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + request_id, prompt, params, arrival_time, + self.get_model_config().max_logprobs, lora_request, trace_headers, + prompt_adapter_request, priority) # 2) Add the request to Detokenizer. self.detokenizer.add_request(detokenizer_req) @@ -158,11 +159,12 @@ def step(self) -> List[RequestOutput]: return request_outputs - # TODO(rob): Can we get rid of these? - def get_model_config(self): + """Gets the model configuration.""" return self.model_config + # TODO(rob): Can we get rid of these? 
+ def start_profile(self): self.engine_core.profile(True) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c1577190c75a..5bcf1b5e7b86e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -39,6 +39,28 @@ def __init__( self.input_processor = input_registry.create_input_processor( model_config) + def _assert_valid_logprobs_prompt_logprobs( + self, + params: Union[SamplingParams, PoolingParams], + max_logprobs: int, + ): + """Validate requested number of sample logprobs & prompt logprobs + + Fails with ValueError if to many logprobs are requested. + + Args: + params: Sampling parameters + max_logprobs: max number of logprobs or prompt logprobs + """ + + if isinstance(params, SamplingParams) and ( + (params.logprobs and params.logprobs > max_logprobs) or + (params.prompt_logprobs + and params.prompt_logprobs > max_logprobs)): + + raise ValueError(f"Cannot request more than " + f"{max_logprobs} logprobs or prompt logprobs.") + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the # asyncio loop while this is running. @@ -48,6 +70,7 @@ def process_inputs( prompt: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: float, + max_logprobs: int, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -55,9 +78,10 @@ def process_inputs( ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: # TODO(woosuk): Support embedding mode. - # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. + self._assert_valid_logprobs_prompt_logprobs(params, max_logprobs) + if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") @@ -106,6 +130,8 @@ def process_inputs( sampling_params.output_kind, sampling_params.stop, sampling_params.include_stop_str_in_output, + sampling_params.logprobs, + sampling_params.prompt_logprobs, ) # Make Request for EngineCore. diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 8574987728844..3cd0430aabd6f 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -15,8 +15,9 @@ class SamplerOutput: # [num_reqs, max_num_logprobs + 1] logprobs: Optional[torch.Tensor] - # TODO: Support prompt logprobs. 
+ # [num_prompt_tokens, max_num_prompt_logprobs + 1] prompt_logprob_token_ids: Optional[torch.Tensor] + # [num_prompt_tokens, max_num_prompt_logprobs + 1] prompt_logprobs: Optional[torch.Tensor] @@ -35,3 +36,8 @@ class ModelRunnerOutput: logprob_token_ids_cpu: Optional[torch.Tensor] # [num_reqs, max_num_logprobs + 1] logprobs_cpu: Optional[torch.Tensor] + + # [num_reqs, max_num_prompt_logprobs] + prompt_logprob_token_ids_cpu: Optional[torch.Tensor] + # [num_reqs, max_num_prompt_logprobs] + prompt_logprobs_cpu: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 51fb4003e5fe0..ce2accbd63aff 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -5,7 +5,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams -from vllm.sequence import RequestMetrics +from vllm.sequence import PromptLogprobs, RequestMetrics, SampleLogprobs from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList @@ -43,6 +43,12 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() + self.max_logprobs = sampling_params.logprobs + self.max_prompt_logprobs = sampling_params.prompt_logprobs + self.logprobs: Optional[SampleLogprobs] = ( + None if self.max_logprobs is None else []) + self.prompt_logprobs: Optional[PromptLogprobs] = ( + None if self.max_prompt_logprobs is None else []) self.num_computed_tokens = 0 # Raw multimodal data before the mm input mapper (e.g., PIL images). diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 9ef36f2e6b212..3bf5a462d5070 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict +from typing import Dict, Optional import torch @@ -19,3 +19,9 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] max_num_logprobs: int + max_num_prompt_logprobs: int + + num_query_tokens: Optional[torch.Tensor] = None + num_sampled_tokens: Optional[torch.Tensor] = None + maybe_sample_logits_indices: Optional[torch.Tensor] = None + prompt_logits_mask: Optional[torch.Tensor] = None diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 927f274541c4d..77424df30e9ca 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Dict +from typing import Dict, Optional, Tuple import torch import torch.nn as nn @@ -12,41 +12,150 @@ class Sampler(nn.Module): - def forward( + def _apply_temperature_top_k_top_p( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - logits = self.apply_temperature(logits, sampling_metadata.temperature) - logits = self.apply_top_k_top_p(logits, sampling_metadata) + num_query_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + + temperature = (sampling_metadata.temperature if + num_query_tokens is None else torch.repeat_interleave( + sampling_metadata.temperature, num_query_tokens)) + + return self._apply_top_k_top_p( + self._apply_temperature(logits, temperature), sampling_metadata) - probs = self.get_probs(logits) + def _probs_sample( + self, + maybe_sample_logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + probs = self.get_probs(maybe_sample_logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to 
reduce the tensor size. - sampled = sampled.to(torch.int32) - - if sampling_metadata.max_num_logprobs > 0: - logprobs = self.get_logprobs(logits) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) - # Use int32 to reduce the tensor size. - topk_indices = topk_indices.to(torch.int32) + return sampled.to(torch.int32) + + def _topk_logprobs_indices( + self, + logprobs: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + topk_logprobs, topk_indices = torch.topk( + logprobs, sampling_metadata.max_num_logprobs, dim=-1) + # Use int32 to reduce the tensor size. + return topk_logprobs, topk_indices.to(torch.int32) + + def forward( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + num_query_tokens = sampling_metadata.num_query_tokens + maybe_sample_logits_indices = ( + sampling_metadata.maybe_sample_logits_indices) + prompt_logits_mask = sampling_metadata.prompt_logits_mask + + if do_prompt_logprobs: + logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( + logits, sampling_metadata, num_query_tokens) + + maybe_sample_logits_w_tmp_tpk_tpp = ( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) else: - topk_logprobs = None - topk_indices = None + maybe_sample_logits_w_tmp_tpk_tpp = ( + self._apply_temperature_top_k_top_p( + logits[maybe_sample_logits_indices], sampling_metadata, + None)) + + maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, + sampling_metadata) + + if do_logprobs and do_prompt_logprobs: + logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) + + maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, + maybe_sampled] + + topk_logprobs, topk_indices = self._topk_logprobs_indices( + logprobs, sampling_metadata) + + maybe_sample_topk_logprobs = topk_logprobs[ + maybe_sample_logits_indices, :] + maybe_sample_topk_indices = topk_indices[ + maybe_sample_logits_indices, :] + prompt_topk_logprobs = topk_logprobs[prompt_logits_mask, :] + prompt_topk_indices = topk_indices[prompt_logits_mask, :] + + # Concat sampled token logprobs + maybe_sample_topk_logprobs = torch.cat( + (maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1) + #Concat sampled token id + maybe_sample_topk_indices = torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1) + elif do_logprobs: + logprobs = self.get_logprobs( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices, :]) + + maybe_sampled_logprobs = logprobs[ + torch.arange(maybe_sampled.shape[0]), maybe_sampled] + + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = self._topk_logprobs_indices(logprobs, sampling_metadata) + + # Concat sampled token logprobs + maybe_sample_topk_logprobs = torch.cat( + (maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1) + #Concat sampled token id + maybe_sample_topk_indices = torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1) + + ( + prompt_topk_logprobs, + prompt_topk_indices, + ) = (None, None) + + elif do_prompt_logprobs: + logprobs = self.get_logprobs( + logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) + + prompt_topk_logprobs, prompt_topk_indices = ( + self._topk_logprobs_indices(logprobs, sampling_metadata)) + 
+ ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = (None, None) + else: + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + prompt_topk_logprobs, + prompt_topk_indices, + ) = (None, None, None, None) sampler_output = SamplerOutput( - sampled_token_ids=sampled, - logprob_token_ids=topk_indices, - logprobs=topk_logprobs, - prompt_logprob_token_ids=None, - prompt_logprobs=None, - ) + sampled_token_ids=maybe_sampled, + logprob_token_ids=maybe_sample_topk_indices, + logprobs=maybe_sample_topk_logprobs, + prompt_logprob_token_ids=prompt_topk_indices, + prompt_logprobs=prompt_topk_logprobs) + return sampler_output - def apply_temperature( + def _apply_temperature( self, logits: torch.Tensor, temp: torch.Tensor, @@ -59,7 +168,7 @@ def apply_temperature( logits.div_(temp.unsqueeze(dim=1)) return logits - def apply_top_k_top_p( + def _apply_top_k_top_p( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 13cbc8fa39c03..0a3fb0535e35a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -207,7 +207,15 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): + def _prepare_inputs( + self, + scheduler_output: "SchedulerOutput", + sampling_metadata: SamplingMetadata, + num_input_tokens: int, + ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, + torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -240,8 +248,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] arange_matrix = np.tile(np.arange(max_num_scheduled_tokens), (num_reqs, 1)) - mask = arange_matrix < num_scheduled_tokens[:, np.newaxis] - arange = arange_matrix[mask] + prompt_logits_mask = arange_matrix < num_scheduled_tokens[:, + np.newaxis] + arange = arange_matrix[prompt_logits_mask] # Get positions. positions = torch.empty((total_num_scheduled_tokens, ), @@ -321,8 +330,27 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. # TODO: Support prompt logprobs. 
- logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + maybe_sample_logits_indices = query_start_loc[1:] - 1 + num_query_tokens = torch.diff(query_start_loc) + num_sampled_tokens = torch.tensor( + scheduler_output.partial_running_reqs, device=self.device) + + # One or more requests require prompt logprobs + complete_req_mask = torch.tensor( + [not x for x in scheduler_output.partial_running_reqs]) + + if do_prompt_logprobs: + prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) + prompt_logits_mask[ + maybe_sample_logits_indices[complete_req_mask]] = False + + return (input_ids, attn_metadata, num_query_tokens, + num_sampled_tokens, maybe_sample_logits_indices, + prompt_logits_mask) + else: + # No requests require prompt logprobs + return (input_ids, attn_metadata, num_query_tokens, + num_sampled_tokens, maybe_sample_logits_indices, None) def _prepare_sampling( self, @@ -421,9 +449,8 @@ def execute_model( self._execute_encoder(scheduler_output) encoder_outputs = self._gather_encoder_outputs(scheduler_output) - # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + sampling_metadata = self._prepare_sampling(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -435,6 +462,21 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + + # Prepare the decoder inputs. + ( + input_ids, + attn_metadata, + num_query_tokens, + num_sampled_tokens, + maybe_sample_logits_indices, + prompt_logits_mask, + ) = self._prepare_inputs(scheduler_output=scheduler_output, + sampling_metadata=sampling_metadata, + num_input_tokens=num_input_tokens) + # Get the inputs embeds. if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( @@ -456,14 +498,18 @@ def execute_model( attn_metadata=None, inputs_embeds=self.inputs_embeds[:num_input_tokens], ) + hidden_states = hidden_states[:num_scheduled_tokens] - hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(hidden_states, None) + + sampling_metadata.num_query_tokens = num_query_tokens + sampling_metadata.num_sampled_tokens = num_sampled_tokens + sampling_metadata.maybe_sample_logits_indices = ( + maybe_sample_logits_indices) + sampling_metadata.prompt_logits_mask = prompt_logits_mask # Sample the next token and get logprobs if needed. 
- sampling_metadata = self._prepare_sampling(scheduler_output) sampler_output = self.model.sample( - logits=logits, + logits=self.model.compute_logits(hidden_states, None), sampling_metadata=sampling_metadata, ) @@ -491,21 +537,27 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - if sampler_output.logprob_token_ids is None: - logprob_token_ids = None - else: - logprob_token_ids = sampler_output.logprob_token_ids.cpu() - if sampler_output.logprobs is None: - logprobs = None - else: - logprobs = sampler_output.logprobs.cpu() + ( + logprob_token_ids, + logprobs, + ) = ((sampler_output.logprob_token_ids.cpu(), + sampler_output.logprobs.cpu()) if do_logprobs else (None, None)) + + ( + prompt_logprob_token_ids, + prompt_logprobs, + ) = ((sampler_output.prompt_logprob_token_ids.cpu(), + sampler_output.prompt_logprobs.cpu()) if do_prompt_logprobs else + (None, None)) + model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, logprob_token_ids_cpu=logprob_token_ids, logprobs_cpu=logprobs, - ) + prompt_logprob_token_ids_cpu=prompt_logprob_token_ids, + prompt_logprobs_cpu=prompt_logprobs) return model_runner_output def load_model(self) -> None: @@ -692,6 +744,7 @@ def __init__( self.generators: Dict[int, torch.Generator] = {} self.num_logprobs: Dict[str, int] = {} + self.num_prompt_logprobs: Dict[str, int] = {} self.prompt_logprob_reqs: Set[str] = set() def add_request( @@ -737,8 +790,11 @@ def add_request( self.generators[req_index] = request.generator num_logprobs = sampling_params.logprobs + num_prompt_logprobs = sampling_params.prompt_logprobs if num_logprobs is not None and num_logprobs > 0: self.num_logprobs[req_id] = num_logprobs + if num_prompt_logprobs is not None and num_prompt_logprobs > 0: + self.num_prompt_logprobs[req_id] = num_prompt_logprobs if sampling_params.prompt_logprobs: self.prompt_logprob_reqs.add(req_id) @@ -754,6 +810,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.top_k_reqs.discard(req_id) self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) + self.num_prompt_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) return req_index @@ -766,6 +823,7 @@ def clear(self) -> None: self.top_k_reqs.clear() self.generators.clear() self.num_logprobs.clear() + self.num_prompt_logprobs.clear() self.prompt_logprob_reqs.clear() def condense(self, empty_req_indices: List[int]) -> None: @@ -832,7 +890,7 @@ def make_sampling_metadata( no_top_k=self.no_top_k, generators=self.generators, max_num_logprobs=self.max_num_logprobs, - ) + max_num_prompt_logprobs=self.max_num_prompt_logprobs) @property def num_reqs(self) -> int: @@ -858,6 +916,11 @@ def no_top_k(self) -> bool: def max_num_logprobs(self) -> int: return max(self.num_logprobs.values()) if self.num_logprobs else 0 + @property + def max_num_prompt_logprobs(self) -> int: + return (max(self.num_prompt_logprobs.values()) + if self.num_prompt_logprobs else 0) + @property def no_logprob(self) -> bool: return len(self.num_logprobs) == 0 From 3f151e99cff39203ff7c84210bcf7c530f786669 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 07:58:04 -0500 Subject: [PATCH 002/293] fixed issue with sample-logprob-only batches Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/sample/sampler.py 
b/vllm/v1/sample/sampler.py index 77424df30e9ca..26dd4bafcff44 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -101,8 +101,7 @@ def forward( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), dim=-1) elif do_logprobs: - logprobs = self.get_logprobs( - logits_w_tmp_tpk_tpp[maybe_sample_logits_indices, :]) + logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) maybe_sampled_logprobs = logprobs[ torch.arange(maybe_sampled.shape[0]), maybe_sampled] From 9ed75c7706675b92eb8d8a2ddbb0af976cde0a43 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 08:23:27 -0500 Subject: [PATCH 003/293] refactored logprobs tensor pythonization in scheduler Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 2 - vllm/outputs.py | 13 +++++- vllm/v1/core/scheduler.py | 68 +++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 114ce7bd1f2fb..29e193e28092f 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -33,14 +33,12 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: no logprobs Args: - batch_logprobs_composition: types of logprobs configs to include in batch Returns: List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) tuples - """ if batch_logprobs_composition == "NONE": # No requests with sample or prompt logprobs diff --git a/vllm/outputs.py b/vllm/outputs.py index 9733158504945..912e485e40b59 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -131,7 +131,18 @@ def new( prompt_logprobs: Optional[PromptLogprobs], finished: bool = False, ) -> "RequestOutput": - """Initialize a new RequestOutput object.""" + """Initialize a new RequestOutput object. + + Args: + request_id + prompt: optional single prompt string + prompt_token_ids: optional list of prompt tokens + text: completion text + token_ids: completion token ids + logprobs: completion sample logprobs + prompt_logprobs: prompt logprobs + finished + """ # TODO: Support `n` > 1. completion_output = CompletionOutput(index=0, diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 476b12c705482..0e09da028b16f 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -385,29 +385,77 @@ def _try_schedule_encoder_inputs( encoder_inputs_to_schedule.append(i) return encoder_inputs_to_schedule, num_new_tokens, encoder_budget - def update_from_output( + def _pythonize_logprobs( self, - scheduler_output: "SchedulerOutput", + do_logprobs: bool, + do_prompt_logprobs: bool, model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: - # NOTE(woosuk): This method doesn't consider speculative decoding. - sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() - num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_logprobs = model_runner_output.logprobs_cpu is not None - do_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu is not None - and len(model_runner_output.prompt_logprobs_cpu) > 0) + ) -> Tuple[List, List, List, List]: + """Convert logprobs tensors to Python data structures. 
+ + Args: + do_logprobs: sample logprobs are required + do_prompt_logprobs: prompt logprobs are required + model_runner_output: model runner output contains CPU logprobs tensors + + Returns: + logprob_token_ids_list + logprob_values_list + prompt_logprob_token_ids_list + prompt_logprob_values_list + """ if do_logprobs: + # Pythonize sample logprobs if needed assert model_runner_output.logprob_token_ids_cpu is not None logprob_token_ids_list = ( model_runner_output.logprob_token_ids_cpu.tolist()) logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + else: + ( + logprob_token_ids_list, + logprob_values_list, + ) = (None, None) if do_prompt_logprobs: + # Pythonize prompt logprobs if needed assert model_runner_output.prompt_logprob_token_ids_cpu is not None prompt_logprob_token_ids_list = ( model_runner_output.prompt_logprob_token_ids_cpu.tolist()) prompt_logprob_values_list = ( model_runner_output.prompt_logprobs_cpu.tolist()) + else: + ( + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = (None, None) + + return (logprob_token_ids_list, logprob_values_list, + prompt_logprob_token_ids_list, prompt_logprob_values_list) + + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[EngineCoreOutput]: + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + + # Get logprobs as Python data structures + ( + logprob_token_ids_list, + logprob_values_list, + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, + model_runner_output) + + if do_prompt_logprobs: + # Index into prompt tokens, for building + # prompt logprobs output data structure curr_prompt_base_idx = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] From 2c2a17376291043ca47b0324dbf49605f8ce269d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 08:42:57 -0500 Subject: [PATCH 004/293] added fast logprobs test Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 131 +++++++++++++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 26 ++---- 2 files changed, 104 insertions(+), 53 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 29e193e28092f..86d34a8285a86 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -75,50 +75,17 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: raise ValueError("Invalid logprobs batch configuration for test.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) -@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) -@pytest.mark.parametrize("batch_logprobs_composition", - ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) -def test_get_logprobs_and_prompt_logprobs( +def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, model: str, dtype: str, - # detokenize: bool, + detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, 
monkeypatch, -): - """Test V1 Engine logprobs & prompt logprobs - - Exercise a variety of combinations of `logprobs` and `prompt_logprobs` - settings and validate that - * The generated logprobs and prompt logprobs are consistent with the - configuration settings, in terms of whether or not the logprobs - (of either type) were requested and how many were requested - * The generated logprobs are consistent with the generated tokens - * The generated (prompt)logprobs are consistent with HuggingFace - (prompt)logprobs, as a reference - - batch_logprobs_composition controls the logprobs configurations for - requests in the batch under test. - - Args: - hf_runner - vllm_runner - model - dtype - detokenize: if False, return generated tokens bypassing detokenizer - batch_logprobs_composition: logprobs configuration for test batch - example_prompts - monkeypatch - """ - detokenize = True - +) -> None: test_prompts = example_prompts # LLM engine v1 @@ -273,6 +240,98 @@ def test_get_logprobs_and_prompt_logprobs( assert vllm_result.prompt_logprobs is None +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +def test_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +) -> None: + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. 
+ + Args: + hf_runner + vllm_runner + model + dtype + detokenize: if False, return generated tokens bypassing detokenizer + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + detokenize = True + + _test_case_get_logprobs_and_prompt_logprobs( + hf_runner=hf_runner, + vllm_runner=vllm_runner, + model=model, + dtype=dtype, + detokenize=detokenize, + batch_logprobs_composition=batch_logprobs_composition, + max_num_batched_tokens=max_num_batched_tokens, + example_prompts=example_prompts, + monkeypatch=monkeypatch) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128]) +@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +def test_fast_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +) -> None: + """Fast test: V1 Engine logprobs & prompt logprobs + + Faster version of `test_get_logprobs_and_prompt_logprobs` with + fewer test cases. + """ + detokenize = True + + _test_case_get_logprobs_and_prompt_logprobs( + hf_runner=hf_runner, + vllm_runner=vllm_runner, + model=model, + dtype=dtype, + detokenize=detokenize, + batch_logprobs_composition=batch_logprobs_composition, + max_num_batched_tokens=max_num_batched_tokens, + example_prompts=example_prompts, + monkeypatch=monkeypatch) + + def test_max_logprobs(monkeypatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a3fb0535e35a..96bf7763e98b3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -537,27 +537,19 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - ( - logprob_token_ids, - logprobs, - ) = ((sampler_output.logprob_token_ids.cpu(), - sampler_output.logprobs.cpu()) if do_logprobs else (None, None)) - - ( - prompt_logprob_token_ids, - prompt_logprobs, - ) = ((sampler_output.prompt_logprob_token_ids.cpu(), - sampler_output.prompt_logprobs.cpu()) if do_prompt_logprobs else - (None, None)) - model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, - logprob_token_ids_cpu=logprob_token_ids, - logprobs_cpu=logprobs, - prompt_logprob_token_ids_cpu=prompt_logprob_token_ids, - prompt_logprobs_cpu=prompt_logprobs) + logprob_token_ids_cpu=(sampler_output.logprob_token_ids.cpu() + if do_logprobs else None), + logprobs_cpu=(sampler_output.logprobs.cpu() + if do_logprobs else None), + prompt_logprob_token_ids_cpu=( + sampler_output.prompt_logprob_token_ids.cpu() + if do_prompt_logprobs else None), + prompt_logprobs_cpu=(sampler_output.prompt_logprobs.cpu() + if do_prompt_logprobs else None)) return model_runner_output def load_model(self) -> None: From b7d9453ab99ed6a7eacb0e3a9a68af3ccce6f6ab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 14:55:00 +0800 Subject: [PATCH 005/293] [Misc] Remove outdated init protocols (#10655) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- vllm/model_executor/models/interfaces.py | 30 ------------------- 
vllm/model_executor/models/interfaces_base.py | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 4f0c75b2c6a57..9b4a97abf9b51 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -10,7 +10,6 @@ from .interfaces_base import is_embedding_model if TYPE_CHECKING: - from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -29,9 +28,6 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ - def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: - ... - # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead @@ -39,9 +35,6 @@ def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: class _SupportsMultiModalType(Protocol): supports_multimodal: Literal[True] - def __call__(self, *, multimodal_config: "MultiModalConfig") -> None: - ... - @overload def supports_multimodal( @@ -81,10 +74,6 @@ class SupportsLoRA(Protocol): embedding_modules: ClassVar[Dict[str, str]] embedding_padding_modules: ClassVar[List[str]] - # lora_config is None when LoRA is not enabled - def __init__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: - ... - # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead @@ -97,9 +86,6 @@ class _SupportsLoRAType(Protocol): embedding_modules: Dict[str, str] embedding_padding_modules: List[str] - def __call__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: - ... - @overload def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: @@ -276,21 +262,11 @@ class HasInnerState(Protocol): for max_num_seqs, etc. True for e.g. both Mamba and Jamba. """ - def __init__(self, - *, - scheduler_config: Optional["SchedulerConfig"] = None) -> None: - ... - @runtime_checkable class _HasInnerStateType(Protocol): has_inner_state: ClassVar[Literal[True]] - def __init__(self, - *, - scheduler_config: Optional["SchedulerConfig"] = None) -> None: - ... - @overload def has_inner_state(model: object) -> TypeIs[HasInnerState]: @@ -323,17 +299,11 @@ class IsAttentionFree(Protocol): True for Mamba but not Jamba. """ - def __init__(self) -> None: - ... - @runtime_checkable class _IsAttentionFreeType(Protocol): is_attention_free: ClassVar[Literal[True]] - def __init__(self) -> None: - ... 
- @overload def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 7bb43beff255c..957a5a6e26b5c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -71,7 +71,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: and issubclass(model, nn.Module)): logger.warning( "The model (%s) is missing " - "vLLM-specific keywords from its initializer: %s", + "vLLM-specific keywords from its `forward` method: %s", model, missing_kws, ) From 6109c69eacc3707df0370f2852efc4d7077b4c37 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 26 Nov 2024 00:20:04 -0800 Subject: [PATCH 006/293] [ci] add vllm_test_utils (#10659) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- Dockerfile | 4 ++ Dockerfile.cpu | 4 ++ Dockerfile.hpu | 3 ++ Dockerfile.neuron | 3 ++ Dockerfile.openvino | 3 ++ Dockerfile.ppc64le | 3 ++ Dockerfile.rocm | 3 ++ Dockerfile.tpu | 3 ++ Dockerfile.xpu | 3 +- tests/entrypoints/llm/test_lazy_outlines.py | 23 +++++--- tests/test_lazy_torch_compile.py | 54 +------------------ tests/vllm_test_utils/setup.py | 7 +++ .../vllm_test_utils/__init__.py | 8 +++ .../vllm_test_utils/vllm_test_utils/blame.py | 53 ++++++++++++++++++ 14 files changed, 113 insertions(+), 61 deletions(-) create mode 100644 tests/vllm_test_utils/setup.py create mode 100644 tests/vllm_test_utils/vllm_test_utils/__init__.py create mode 100644 tests/vllm_test_utils/vllm_test_utils/blame.py diff --git a/Dockerfile b/Dockerfile index 220dbe26712ec..682f046d4b6ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,10 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -e tests/vllm_test_utils + # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install hf_transfer diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 287b4958da4e5..d2f72ea975a3d 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -62,4 +62,8 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -e tests/vllm_test_utils + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.hpu b/Dockerfile.hpu index d18fc016387bf..87e0c1a6a934e 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 2143315d2a078..76dbd4c04d3f3 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -38,4 +38,7 @@ ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ pip install --no-build-isolation -v -e . 
+# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index a05ff452cd36e..8bd188ffde408 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC COPY examples/ /workspace/examples COPY benchmarks/ /workspace/benchmarks +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index b19c6ddec7948..971248577983f 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=bind,source=.git,target=.git \ VLLM_TARGET_DEVICE=cpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 62d4a9b4909c3..e733994f8c33e 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -168,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ if ls libs/*.whl; then \ python3 -m pip install libs/*.whl; fi +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 0a507b6ecdf60..b617932a85b47 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ -r requirements-tpu.txt RUN python3 setup.py develop +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 63bc682770422..a374f20d7d949 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image \ TRITON_XPU_PROFILE 1 - +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index cbfb0cc32c1ce..81fb000d8ac56 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,12 +1,12 @@ import sys +from vllm_test_utils import blame + from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory -def test_lazy_outlines(sample_regex): - """If users don't use guided decoding, outlines should not be imported. - """ +def run_normal(): prompts = [ "Hello, my name is", "The president of the United States is", @@ -25,13 +25,12 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - # make sure outlines is not imported - assert 'outlines' not in sys.modules - # Destroy the LLM object and free up the GPU memory. del llm cleanup_dist_env_and_memory() + +def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. 
llm = LLM(model="facebook/opt-125m", enforce_eager=True, @@ -51,5 +50,15 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def test_lazy_outlines(sample_regex): + """If users don't use guided decoding, outlines should not be imported. + """ # make sure outlines is not imported - assert 'outlines' not in sys.modules + module_name = "outlines" + with blame(lambda: module_name in sys.modules) as result: + run_normal() + run_lmfe(sample_regex) + assert not result.found, ( + f"Module {module_name} is already imported, the" + f" first import location is:\n{result.trace_stack}") diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py index b8ac4dd93732b..4756fac8e2a8d 100644 --- a/tests/test_lazy_torch_compile.py +++ b/tests/test_lazy_torch_compile.py @@ -1,61 +1,9 @@ # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script - -import contextlib -import dataclasses import sys -import traceback -from typing import Callable, Generator - - -@dataclasses.dataclass -class BlameResult: - found: bool = False - trace_stack: str = "" - - -@contextlib.contextmanager -def blame(func: Callable) -> Generator[BlameResult, None, None]: - """ - Trace the function calls to find the first function that satisfies the - condition. The trace stack will be stored in the result. - - Usage: - - ```python - with blame(lambda: some_condition()) as result: - # do something - - if result.found: - print(result.trace_stack) - """ - result = BlameResult() - - def _trace_calls(frame, event, arg=None): - nonlocal result - if event in ['call', 'return']: - # for every function call or return - try: - # Temporarily disable the trace function - sys.settrace(None) - # check condition here - if not result.found and func(): - result.found = True - result.trace_stack = "".join(traceback.format_stack()) - # Re-enable the trace function - sys.settrace(_trace_calls) - except NameError: - # modules are deleted during shutdown - pass - return _trace_calls - - sys.settrace(_trace_calls) - - yield result - - sys.settrace(None) +from vllm_test_utils import blame module_name = "torch._inductor.async_compile" diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py new file mode 100644 index 0000000000000..790e891ec837d --- /dev/null +++ b/tests/vllm_test_utils/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup + +setup( + name='vllm_test_utils', + version='0.1', + packages=['vllm_test_utils'], +) diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py new file mode 100644 index 0000000000000..bf0b62a5b75e3 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -0,0 +1,8 @@ +""" +vllm_utils is a package for vLLM testing utilities. +It does not import any vLLM modules. 
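For reference, a minimal standalone sketch of how the `blame` helper this package provides (defined in blame.py below) can be used, assuming `tests/vllm_test_utils` has been pip-installed as the Dockerfiles above now do; the traced module and the dummy workload are illustrative:

```python
import sys

from vllm_test_utils import blame


def run_workload():
    # Stand-in for code that may or may not pull in a heavy dependency.
    import json  # noqa: F401


with blame(lambda: "json" in sys.modules) as result:
    run_workload()

if result.found:
    # Stack trace of the first call during which the condition became true.
    print(result.trace_stack)
```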
+""" + +from .blame import BlameResult, blame + +__all__ = ["blame", "BlameResult"] diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py new file mode 100644 index 0000000000000..ad23ab83c2d81 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -0,0 +1,53 @@ +import contextlib +import dataclasses +import sys +import traceback +from typing import Callable, Generator + + +@dataclasses.dataclass +class BlameResult: + found: bool = False + trace_stack: str = "" + + +@contextlib.contextmanager +def blame(func: Callable) -> Generator[BlameResult, None, None]: + """ + Trace the function calls to find the first function that satisfies the + condition. The trace stack will be stored in the result. + + Usage: + + ```python + with blame(lambda: some_condition()) as result: + # do something + + if result.found: + print(result.trace_stack) + """ + result = BlameResult() + + def _trace_calls(frame, event, arg=None): + nonlocal result + if event in ['call', 'return']: + # for every function call or return + try: + # Temporarily disable the trace function + sys.settrace(None) + # check condition here + if not result.found and func(): + result.found = True + result.trace_stack = "".join(traceback.format_stack()) + # Re-enable the trace function + sys.settrace(_trace_calls) + except NameError: + # modules are deleted during shutdown + pass + return _trace_calls + + sys.settrace(_trace_calls) + + yield result + + sys.settrace(None) From 8acd4ebe600e6d97145e56e08b4c45d09008b055 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 26 Nov 2024 18:36:45 +0800 Subject: [PATCH 007/293] [V1] Enable profile for LLMEngine (#10665) Signed-off-by: Andrew Feldman --- vllm/v1/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a5482f03b6fa..bd19d998a4adb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -161,13 +161,13 @@ def step(self) -> List[RequestOutput]: # TODO(rob): Can we get rid of these? 
def get_model_config(self): - pass + return self.model_config def start_profile(self): - pass + self.engine_core.profile(True) def stop_profile(self): - pass + self.engine_core.profile(False) def get_tokenizer_group(self, group_type): pass From 4621a0b26e57a4ba77de0a1c1e1b8c9bd84b0a6b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 07:48:22 -0500 Subject: [PATCH 008/293] Squash commit of all changes from v1_logprobs Signed-off-by: Andrew Feldman --- tests/v1/samplers/__init__.py | 0 tests/v1/samplers/test_logprobs.py | 340 +++++++++++++++++++ vllm/outputs.py | 16 +- vllm/transformers_utils/detokenizer_utils.py | 51 ++- vllm/v1/core/scheduler.py | 152 ++++++++- vllm/v1/engine/__init__.py | 9 + vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/detokenizer.py | 60 +++- vllm/v1/engine/llm_engine.py | 10 +- vllm/v1/engine/processor.py | 28 +- vllm/v1/outputs.py | 8 +- vllm/v1/request.py | 8 +- vllm/v1/sample/metadata.py | 8 +- vllm/v1/sample/sampler.py | 161 +++++++-- vllm/v1/worker/gpu_model_runner.py | 107 ++++-- 15 files changed, 885 insertions(+), 76 deletions(-) create mode 100644 tests/v1/samplers/__init__.py create mode 100644 tests/v1/samplers/test_logprobs.py diff --git a/tests/v1/samplers/__init__.py b/tests/v1/samplers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py new file mode 100644 index 0000000000000..114ce7bd1f2fb --- /dev/null +++ b/tests/v1/samplers/test_logprobs.py @@ -0,0 +1,340 @@ +from typing import List, Tuple + +import pytest +import torch + +from tests.kernels.utils import override_backend_env_variable +from vllm import SamplingParams + +from ...conftest import VllmRunner + +MODELS = ["facebook/opt-125m"] + + +def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: + """Generate logprobs configs for a batch of requests + + A given request's logprobs configuration is (1) num_sample_logprobs and (2) + num_prompt_logprobs. The batch logprobs configuration is the list of request + logprobs configs. 
+ + batch_logprobs_composition == "NONE" yields a batch with no sample or prompt + logprobs + + batch_logprobs_composition == "SAMPLE" yields a batch with some requests + configured for sample logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "PROMPT" yields a batch with some requests + configured for prompt logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "SAMPLE_PROMPT" yields a batch with some + requests configured for sample logprobs and prompt logprobs, some configured + for only sample logprobs or only prompt logprobs, and some configured for + no logprobs + + Args: + + batch_logprobs_composition: types of logprobs configs to include in batch + + Returns: + + List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + tuples + + """ + if batch_logprobs_composition == "NONE": + # No requests with sample or prompt logprobs + return [(None, None), (0, None), (None, 0), (0, 0)] + elif batch_logprobs_composition == "SAMPLE": + return [ + (None, None), + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + ] + elif batch_logprobs_composition == "PROMPT": + return [ + (None, 0), + (0, None), + (0, 0), + (None, 6), + (0, 5), + ] + elif batch_logprobs_composition == "SAMPLE_PROMPT": + return [ + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + (6, 3), + (None, 6), + (0, 5), + ] + else: + raise ValueError("Invalid logprobs batch configuration for test.") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +def test_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +): + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. 
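For orientation, the data structures validated below look roughly as follows; the token ids, logprob values, and decoded strings here are purely illustrative:

```python
from vllm.sequence import Logprob

# Sample logprobs: one dict per generated token, mapping token id -> Logprob.
sample_logprobs = [
    {464: Logprob(logprob=-0.3, rank=1, decoded_token=" The"),
     318: Logprob(logprob=-1.7, rank=2, decoded_token=" is")},
]

# Prompt logprobs: one entry per prompt token, with None for the first prompt
# token, which has no preceding context to condition on.
prompt_logprobs = [
    None,
    {31414: Logprob(logprob=-9.2, rank=1, decoded_token="Hello")},
]
```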
+ + Args: + hf_runner + vllm_runner + model + dtype + detokenize: if False, return generated tokens bypassing detokenizer + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + detokenize = True + + test_prompts = example_prompts + + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + max_num_seqs = 128 + max_num_batched_tokens = 128 + max_model_len = 128 + + max_tokens = 5 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy( + test_prompts, + max_tokens=max_tokens, + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = _get_test_batch(batch_logprobs_composition) + + # We rely on there being more prompts than combinations of + # logprobs & prompt logprobs which we want to test + assert len(test_prompts) >= len(logprob_prompt_logprob_list) + # Make sure there is a sample params for each prompt + num_extra_params = len(test_prompts) - len(logprob_prompt_logprob_list) + if num_extra_params > 0: + logprob_prompt_logprob_list = ( + logprob_prompt_logprob_list + + logprob_prompt_logprob_list[-num_extra_params:]) + # Now the number of prompts should match the number of sample params combos + assert len(test_prompts) == len(logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=lp, + prompt_logprobs=plp, + temperature=0.0, + detokenize=detokenize) + for lp, plp in logprob_prompt_logprob_list + ] + + with vllm_runner( + model, + dtype=dtype, + max_logprobs=7, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + enforce_eager=True, + ) as vllm_model: + vllm_results = vllm_model.model.generate( + test_prompts, sampling_params=vllm_sampling_params) + + for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( + vllm_results, hf_logprobs, hf_outputs, + logprob_prompt_logprob_list): + + # Extract request-level (prompt)logprobs config + num_top_logprobs = logprob_prompt_logprob[0] + num_top_prompt_logprobs = logprob_prompt_logprob[1] + + # Test whether sampled token output is consistent between vLLM and HF + # vLLM prompt+completion should match HF output + assert (vllm_result.prompt_token_ids + + vllm_result.outputs[0].token_ids == hf_output[0]) + + # Validate sample logprobs + if num_top_logprobs is not None and num_top_logprobs > 0: + assert num_top_logprobs is not None + # Confirm that the structure of the sample logprobs in the result is + # correct + assert vllm_result.outputs[0].logprobs is not None + assert len(vllm_result.outputs[0].logprobs) == max_tokens + for logprobs in vllm_result.outputs[0].logprobs: + assert logprobs is not None + # If the output token is not included in the top X + # logprob, it can return 1 more data + assert (len(logprobs) == num_top_logprobs + or len(logprobs) == num_top_logprobs + 1) + output_text = vllm_result.outputs[0].text + output_string_from_most_likely_tokens_lst: List[str] = [] + for top_logprobs in vllm_result.outputs[0].logprobs: + top_logprob = next(iter(top_logprobs.values())) + output_string_from_most_likely_tokens_lst.append( + top_logprob.decoded_token) + + if detokenize: + output_string_from_most_likely_tokens = "".join( + output_string_from_most_likely_tokens_lst) + assert output_text == 
output_string_from_most_likely_tokens, ( + "The output text from the top logprob for each token " + "position should be the same as the output text in the " + "result.") + else: + assert output_text == '' + assert output_string_from_most_likely_tokens_lst == ( + [None] * max_tokens) + + # Compare vLLM sample logprobs to HF + vllm_sample_logprobs = vllm_result.outputs[0].logprobs + for i, top_logprobs in enumerate(vllm_sample_logprobs): + for token_id, sample_logprob in top_logprobs.items(): + logprob = sample_logprob.logprob + torch.testing.assert_close( + logprob, + hf_logprob[i][-1][token_id].item(), + atol=1e-2, + rtol=1e-2) + if detokenize: + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is" + " returned to the user.") + else: + # Logprobs disabled for this request; should be None + assert vllm_result.outputs[0].logprobs is None + + # Validate prompt logprobs + if (num_top_prompt_logprobs is not None + and num_top_prompt_logprobs > 0): + # Confirm that structure of prompt logprobs in result is correct + assert vllm_result.prompt_logprobs is not None + # - The first prompt logprob is always None + assert vllm_result.prompt_logprobs[0] is None + # - Prompt logprobs are returned for all indices in + # the prompt + assert len(vllm_result.prompt_logprobs) == len( + vllm_result.prompt_token_ids) + for prompt_logprobs in vllm_result.prompt_logprobs[1:]: + assert prompt_logprobs is not None + # - If the prompt token is not included in the top X + # logprob, it can return 1 more data + assert (len(prompt_logprobs) == num_top_prompt_logprobs + or len(prompt_logprobs) == num_top_prompt_logprobs + 1) + + # Compare prompt logprobs to HF + # The first prompt logprob is always None, so we compare it from + # 1:. 
+ vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] + for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): + for token_id, logprob in vllm_prompt_logprob_dict.items(): + torch.testing.assert_close( + logprob.logprob, + hf_logprob[0][i][token_id].item(), + atol=1e-2, + rtol=1e-2) + else: + assert vllm_result.prompt_logprobs is None + + +def test_max_logprobs(monkeypatch): + """vLLM v1 engine should fail a request with `logprobs > max_logprobs` + + Should also fail for `prompt_logprobs > max_logprobs` + + Args: + monkeypatch + """ + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + runner = VllmRunner("facebook/opt-125m", max_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("detokenize", [True, False]) +def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, + monkeypatch): + """Engine should return `logprobs` and `prompt_logprobs` as `None` + + Args: + vllm_runner + model + detokenize: whether to feed generated tokens to detokenizer + example_prompts + monkeypatch + """ + + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + max_num_seqs = 256 + max_num_batched_tokens = None + max_tokens = 5 + + with vllm_runner( + model, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + detokenize=detokenize) + results_logprobs_none = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_none) + + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None diff --git a/vllm/outputs.py b/vllm/outputs.py index 2d256803edfe8..9733158504945 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -127,24 +127,24 @@ def new( prompt_token_ids: Optional[List[int]], text: str, token_ids: List[int], + logprobs: Optional[SampleLogprobs], + prompt_logprobs: Optional[PromptLogprobs], finished: bool = False, ) -> "RequestOutput": """Initialize a new RequestOutput object.""" # TODO: Support `n` > 1. 
- completion_output = CompletionOutput( - index=0, - text=text, - token_ids=token_ids, - cumulative_logprob=None, - logprobs=None, # TODO - ) + completion_output = CompletionOutput(index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=None, + logprobs=logprobs) return RequestOutput( request_id=request_id, prompt=prompt, prompt_token_ids=prompt_token_ids, - prompt_logprobs=None, # TODO + prompt_logprobs=prompt_logprobs, outputs=[completion_output], finished=finished, ) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 37ff8a236e791..885e3b9d92f88 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,4 +1,6 @@ -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple + +from vllm.sequence import Logprob from .tokenizer import AnyTokenizer @@ -165,3 +167,50 @@ def detokenize_incrementally( new_text = new_text[len(prefix_text):] return new_tokens, new_text, read_offset, len(output_tokens) + + +def detokenize_logprob_incrementally_in_place( + tokenizer: AnyTokenizer, + logprob_dict: Dict[int, Logprob], + input_ids_prefix: List[int], + prev_tokens: Optional[List[str]], + prefix_offset: int, + read_offset: int, + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = True, +) -> None: + """Detokenizes the logprobs at a single token offset incrementally. + + For each top-token in `logprob_dict`, apply incremental detokenization + to the token list `input_ids_prefix + [top-token id]` + + The logprob data structure is modified in-place with the string + representation of each decoded top-token. + + Args: + tokenizer: The tokenizer to use. + logprob_dict: logprob data structure for a single token position + input_ids_prefix: The input ids *preceding* the token offset under + consideration + prev_tokens: The previous tokens. If None, this function will convert + the input ids to tokens and return the tokens and the new text. + prefix_offset: The prefix offset. + read_offset: The read offset. + skip_special_tokens: Whether to skip special tokens. + spaces_between_special_tokens: Whether to add spaces between special + tokens. 
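The underlying idea can be sketched independently of vLLM's tokenizer plumbing: each candidate token is decoded in the context of the ids that precede it, and only the newly contributed text is kept. A rough standalone illustration, using the HF tokenizer for the test model above with arbitrary prompt and candidate tokens:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
prefix_ids = tokenizer("Hello, my name")["input_ids"]


def incremental_text(prefix_ids, candidate_id):
    # Decode with and without the candidate appended; the difference is the
    # text contributed by the candidate token in this context.
    without = tokenizer.decode(prefix_ids)
    with_candidate = tokenizer.decode(prefix_ids + [candidate_id])
    return with_candidate[len(without):]


for candidate_id in (tokenizer(" is")["input_ids"][-1],
                     tokenizer(" was")["input_ids"][-1]):
    print(candidate_id, repr(incremental_text(prefix_ids, candidate_id)))
```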
+ """ + + for token_id in logprob_dict: + # Detokenize logprob for a particular top + # token at a particular token offset + + logprob_dict[token_id].decoded_token = detokenize_incrementally( + tokenizer=tokenizer, + all_input_ids=input_ids_prefix + [token_id], + prev_tokens=prev_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + )[1] diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ba50a9786d805..476b12c705482 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -6,6 +6,7 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.sequence import Logprob from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput @@ -247,6 +248,13 @@ def schedule(self) -> "SchedulerOutput": self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget + # Now that requests are scheduled, generate a mask indicating which + # request is partial + partial_running_reqs = [ + (req.num_computed_tokens + num_scheduled_tokens[req.request_id] < + req.num_tokens) for req in self.running + ] + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -277,6 +285,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, + partial_running_reqs=partial_running_reqs, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -384,11 +393,85 @@ def update_from_output( # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + if do_logprobs: + assert model_runner_output.logprob_token_ids_cpu is not None + logprob_token_ids_list = ( + model_runner_output.logprob_token_ids_cpu.tolist()) + logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + if do_prompt_logprobs: + assert model_runner_output.prompt_logprob_token_ids_cpu is not None + prompt_logprob_token_ids_list = ( + model_runner_output.prompt_logprob_token_ids_cpu.tolist()) + prompt_logprob_values_list = ( + model_runner_output.prompt_logprobs_cpu.tolist()) + curr_prompt_base_idx = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] + req_index = model_runner_output.req_id_to_index[req_id] + num_new_tokens = 1 + max_logprobs = request.max_logprobs + request_do_logprobs = (do_logprobs and max_logprobs is not None + and max_logprobs > 0) + + if do_prompt_logprobs: + max_prompt_logprobs = request.max_prompt_logprobs + num_new_prompt_tokens = ( + num_scheduled_tokens[request.request_id] - + int(not scheduler_output.partial_running_reqs[req_index])) + + request_do_prompt_logprobs = (max_prompt_logprobs is not None + and max_prompt_logprobs > 0 + and num_new_prompt_tokens > 0) + + if request_do_prompt_logprobs: + + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. + # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. + + slice_upper_index = (curr_prompt_base_idx + + num_new_prompt_tokens) + prompt_logprob_token_ids = prompt_logprob_token_ids_list[ + curr_prompt_base_idx:slice_upper_index] + prompt_logprob_values = prompt_logprob_values_list[ + curr_prompt_base_idx:slice_upper_index] + curr_prompt_base_idx = slice_upper_index + + logprob_cnt = max_prompt_logprobs + prompt_logprobs = [{ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(plp_tok_values[0:logprob_cnt], + plp_tok_token_ids[0:logprob_cnt])) + } for plp_tok_values, plp_tok_token_ids in zip( + prompt_logprob_values, prompt_logprob_token_ids)] + + if not request.prompt_logprobs: + # Ensure that None is the first prompt logprob + prompt_logprobs = [None] + prompt_logprobs + + curr_prompt_base_idx = slice_upper_index + + prompt_slice_range_upper = request.num_computed_tokens + prompt_slice_range_lower = (prompt_slice_range_upper - + num_new_prompt_tokens) + request.prompt_logprobs.extend(prompt_logprobs) + else: + curr_prompt_base_idx += num_new_prompt_tokens + else: + request_do_prompt_logprobs = False + # When the request's num_computed_tokens catches up its num_tokens, # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. @@ -405,12 +488,45 @@ def update_from_output( self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: - req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request # generates at most one token at each step. 
token_id = sampled_token_ids[req_index] + if request_do_logprobs: + # Construct logprobs, if requested (TODO: assumes one + # generated token). + logprob_token_ids = logprob_token_ids_list[req_index] + logprob_values = logprob_values_list[req_index] + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:max_logprobs]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = ( + [float('-inf')] * + (len(logprob_values) - 1 - max_logprobs)) + + indices = sorted(range(len(logprob_values)), + key=lambda k: logprob_values[k], + reverse=True) + logprob_values = [logprob_values[i] for i in indices] + logprob_token_ids = [ + logprob_token_ids[i] for i in indices + ] + + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_values = logprob_values[0:logprob_cnt] + logprob_token_ids = logprob_token_ids[0:logprob_cnt] + + request.logprobs.append({ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(logprob_values, logprob_token_ids)) + }) request.append_output_token_ids(token_id) - num_new_tokens = 1 # TODO: Update the KV cache manager for prefix caching. # Check for stop and update request state. @@ -418,18 +534,47 @@ def update_from_output( stopped = self._check_stop(request) # Add EngineCoreOutput for this Request. + # Return the logprob for the most recently computed tokens. + # Return no prompt logprobs in decode-phase. output = EngineCoreOutput( request_id=req_id, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason) + stop_reason=request.stop_reason, + logprobs=(request.logprobs[-num_new_tokens:] + if request_do_logprobs else None), + prompt_logprobs=(prompt_logprobs + if request_do_prompt_logprobs else None), + prompt_logprobs_token_ids=(request.prompt_token_ids + if request_do_prompt_logprobs + else None)) engine_core_outputs.append(output) # Breakout of the loop. 
if stopped: continue + elif request_do_prompt_logprobs: + # This request is still partial but prompt logprobs were + # requested + engine_core_outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=[] if request_do_logprobs else None, + prompt_logprobs=( + prompt_logprobs if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)), + prompt_logprobs_token_ids=( + request.prompt_token_ids[prompt_slice_range_lower: + prompt_slice_range_upper] + if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)))) + new_running.append(request) self.running = new_running return engine_core_outputs @@ -581,6 +726,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] + partial_running_reqs: List[bool] # True if running req is partial num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 967124fd850ea..46ee3154d69c0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -7,6 +7,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sequence import PromptLogprobs, SampleLogprobs @dataclass @@ -22,6 +23,11 @@ class DetokenizerRequest: stop: List[str] include_stop_str_in_output: bool + # Per-request logprobs & prompt logprobs + # counts; None is equivalent to 0 + logprobs: Optional[int] + prompt_logprobs: Optional[int] + @dataclass class EngineCoreRequest: @@ -52,6 +58,9 @@ class EngineCoreOutput(msgspec.Struct, request_id: str new_token_ids: List[int] finished: bool + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + prompt_logprobs_token_ids: Optional[List[int]] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a17c8eac4b77c..421ecc8c0d921 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -144,7 +144,8 @@ async def add_request( # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, + request_id, prompt, params, arrival_time, + (await self.get_model_config()).max_logprobs, lora_request, trace_headers, prompt_adapter_request, priority) # 3) Add the request to Detokenizer (this process). 
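Stepping back from the diff, the effect of the inject-and-resort logic in the scheduler above can be summarized with a small sketch; the token ids and logprob values are made up:

```python
# The user asked for `max_logprobs` entries, but the sampled token must always
# be reported, so a returned sample-logprobs dict may contain one extra entry
# when the sampled token falls outside the requested top tokens.
max_logprobs = 2
top_k = {7: -0.1, 13: -1.3}          # token id -> logprob for the top-2 tokens
sampled_token_id, sampled_logprob = 42, -2.0

returned = dict(top_k)
if sampled_token_id not in returned:
    returned[sampled_token_id] = sampled_logprob

assert len(returned) == max_logprobs + 1
```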
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6249d60199a62..5ad8b8c725f3e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,21 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind +from vllm.sequence import PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, + detokenize_logprob_incrementally_in_place) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput logger = init_logger(__name__) +AnyLogprobs = Union[Optional[SampleLogprobs], Optional[PromptLogprobs]] + @dataclass class IncrementalDetokenizer: @@ -20,6 +24,8 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] # Stop strings stop: List[str] @@ -72,6 +78,11 @@ def from_new_request( else: stop_buffer_length = 0 + # Logprobs & prompt logprobs settings + do_logprobs = request.logprobs is not None and request.logprobs > 0 + do_prompt_logprobs = (request.prompt_logprobs is not None + and request.prompt_logprobs > 0) + return cls( output_text="", tokens=tokens, @@ -91,25 +102,34 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - ) + logprobs=[] if do_logprobs else None, + prompt_logprobs=[] if do_prompt_logprobs else None) def add_tokens( self, new_token_ids: List[int], + new_logprobs: Optional[SampleLogprobs], + new_prompt_logprobs: Optional[PromptLogprobs], finish_reason: Optional[str], stop_reason: Optional[str], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. + 1a) If necessary, detokenize logprobs incrementally + 1b) If necessary, detokenize prompt logprobs incrementally 2) Update the RequestOutput with the new text. """ - # 1) Detokenize the new token ids incrementally. + do_logprobs = new_logprobs is not None and len(new_logprobs) > 0 + assert not do_logprobs or len(new_logprobs) == len(new_token_ids) + + # 1) Detokenize the new token ids incrementally. If necessary, + # detokenize logprobs. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. 
decoded_text = "" - for new_token_id in new_token_ids: + for tdx, new_token_id in enumerate(new_token_ids): self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -123,6 +143,23 @@ def add_tokens( spaces_between_special_tokens, ) + if do_logprobs: + # Detokenize individual token logprobs in-place + logprob_dict = new_logprobs[tdx] + assert logprob_dict is not None + detokenize_logprob_incrementally_in_place( + tokenizer=self.tokenizer, + logprob_dict=logprob_dict, + input_ids_prefix=self.token_ids[0:-1], + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + self.logprobs.append(logprob_dict) + self.tokens.extend(new_tokens) self.prefix_offset = prefix_offset self.read_offset = read_offset @@ -130,6 +167,10 @@ def add_tokens( decoded_text += new_decoded_token_text + # 1b) If necessary, detokenize prompt logprobs incrementally + if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: + self.prompt_logprobs.extend(new_prompt_logprobs) + # 2) Evaluate stop criteria. if self.stop: stop = StopChecker.check_stop_strings( @@ -139,11 +180,10 @@ def add_tokens( include_in_output=self.include_stop_str_in_output, ) if stop is not None: - stop_str, truncate_to = stop + _, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] finish_reason = "stop" # TODO: use constant - stop_reason = stop_str # TODO: handle stop_token_ids here too? @@ -156,6 +196,8 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids + logprobs = new_logprobs if delta else self.logprobs + prompt_logprobs = new_prompt_logprobs if delta else self.prompt_logprobs request_output = RequestOutput.new( self.request_id, @@ -163,6 +205,8 @@ def add_tokens( self.prompt_token_ids, output_text, token_ids, + logprobs, + prompt_logprobs, finished, ) @@ -254,6 +298,8 @@ def step( # Detokenize and update state. request_output = detokenizer.add_tokens( new_token_ids=engine_core_output.new_token_ids, + new_logprobs=engine_core_output.logprobs, + new_prompt_logprobs=engine_core_output.prompt_logprobs, finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bd19d998a4adb..b93634230529e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -134,8 +134,9 @@ def add_request( # 1) Process raw inputs into the request. detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + request_id, prompt, params, arrival_time, + self.get_model_config().max_logprobs, lora_request, trace_headers, + prompt_adapter_request, priority) # 2) Add the request to Detokenizer. self.detokenizer.add_request(detokenizer_req) @@ -158,11 +159,12 @@ def step(self) -> List[RequestOutput]: return request_outputs - # TODO(rob): Can we get rid of these? - def get_model_config(self): + """Gets the model configuration.""" return self.model_config + # TODO(rob): Can we get rid of these? 
+ def start_profile(self): self.engine_core.profile(True) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c1577190c75a..5bcf1b5e7b86e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -39,6 +39,28 @@ def __init__( self.input_processor = input_registry.create_input_processor( model_config) + def _assert_valid_logprobs_prompt_logprobs( + self, + params: Union[SamplingParams, PoolingParams], + max_logprobs: int, + ): + """Validate requested number of sample logprobs & prompt logprobs + + Fails with ValueError if to many logprobs are requested. + + Args: + params: Sampling parameters + max_logprobs: max number of logprobs or prompt logprobs + """ + + if isinstance(params, SamplingParams) and ( + (params.logprobs and params.logprobs > max_logprobs) or + (params.prompt_logprobs + and params.prompt_logprobs > max_logprobs)): + + raise ValueError(f"Cannot request more than " + f"{max_logprobs} logprobs or prompt logprobs.") + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the # asyncio loop while this is running. @@ -48,6 +70,7 @@ def process_inputs( prompt: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: float, + max_logprobs: int, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -55,9 +78,10 @@ def process_inputs( ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: # TODO(woosuk): Support embedding mode. - # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. + self._assert_valid_logprobs_prompt_logprobs(params, max_logprobs) + if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") @@ -106,6 +130,8 @@ def process_inputs( sampling_params.output_kind, sampling_params.stop, sampling_params.include_stop_str_in_output, + sampling_params.logprobs, + sampling_params.prompt_logprobs, ) # Make Request for EngineCore. diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 8574987728844..3cd0430aabd6f 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -15,8 +15,9 @@ class SamplerOutput: # [num_reqs, max_num_logprobs + 1] logprobs: Optional[torch.Tensor] - # TODO: Support prompt logprobs. 
+ # [num_prompt_tokens, max_num_prompt_logprobs + 1] prompt_logprob_token_ids: Optional[torch.Tensor] + # [num_prompt_tokens, max_num_prompt_logprobs + 1] prompt_logprobs: Optional[torch.Tensor] @@ -35,3 +36,8 @@ class ModelRunnerOutput: logprob_token_ids_cpu: Optional[torch.Tensor] # [num_reqs, max_num_logprobs + 1] logprobs_cpu: Optional[torch.Tensor] + + # [num_reqs, max_num_prompt_logprobs] + prompt_logprob_token_ids_cpu: Optional[torch.Tensor] + # [num_reqs, max_num_prompt_logprobs] + prompt_logprobs_cpu: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 51fb4003e5fe0..ce2accbd63aff 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -5,7 +5,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams -from vllm.sequence import RequestMetrics +from vllm.sequence import PromptLogprobs, RequestMetrics, SampleLogprobs from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList @@ -43,6 +43,12 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() + self.max_logprobs = sampling_params.logprobs + self.max_prompt_logprobs = sampling_params.prompt_logprobs + self.logprobs: Optional[SampleLogprobs] = ( + None if self.max_logprobs is None else []) + self.prompt_logprobs: Optional[PromptLogprobs] = ( + None if self.max_prompt_logprobs is None else []) self.num_computed_tokens = 0 # Raw multimodal data before the mm input mapper (e.g., PIL images). diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 9ef36f2e6b212..3bf5a462d5070 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict +from typing import Dict, Optional import torch @@ -19,3 +19,9 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] max_num_logprobs: int + max_num_prompt_logprobs: int + + num_query_tokens: Optional[torch.Tensor] = None + num_sampled_tokens: Optional[torch.Tensor] = None + maybe_sample_logits_indices: Optional[torch.Tensor] = None + prompt_logits_mask: Optional[torch.Tensor] = None diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 927f274541c4d..77424df30e9ca 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Dict +from typing import Dict, Optional, Tuple import torch import torch.nn as nn @@ -12,41 +12,150 @@ class Sampler(nn.Module): - def forward( + def _apply_temperature_top_k_top_p( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - logits = self.apply_temperature(logits, sampling_metadata.temperature) - logits = self.apply_top_k_top_p(logits, sampling_metadata) + num_query_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + + temperature = (sampling_metadata.temperature if + num_query_tokens is None else torch.repeat_interleave( + sampling_metadata.temperature, num_query_tokens)) + + return self._apply_top_k_top_p( + self._apply_temperature(logits, temperature), sampling_metadata) - probs = self.get_probs(logits) + def _probs_sample( + self, + maybe_sample_logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + probs = self.get_probs(maybe_sample_logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to 
reduce the tensor size. - sampled = sampled.to(torch.int32) - - if sampling_metadata.max_num_logprobs > 0: - logprobs = self.get_logprobs(logits) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) - # Use int32 to reduce the tensor size. - topk_indices = topk_indices.to(torch.int32) + return sampled.to(torch.int32) + + def _topk_logprobs_indices( + self, + logprobs: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + topk_logprobs, topk_indices = torch.topk( + logprobs, sampling_metadata.max_num_logprobs, dim=-1) + # Use int32 to reduce the tensor size. + return topk_logprobs, topk_indices.to(torch.int32) + + def forward( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + num_query_tokens = sampling_metadata.num_query_tokens + maybe_sample_logits_indices = ( + sampling_metadata.maybe_sample_logits_indices) + prompt_logits_mask = sampling_metadata.prompt_logits_mask + + if do_prompt_logprobs: + logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( + logits, sampling_metadata, num_query_tokens) + + maybe_sample_logits_w_tmp_tpk_tpp = ( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) else: - topk_logprobs = None - topk_indices = None + maybe_sample_logits_w_tmp_tpk_tpp = ( + self._apply_temperature_top_k_top_p( + logits[maybe_sample_logits_indices], sampling_metadata, + None)) + + maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, + sampling_metadata) + + if do_logprobs and do_prompt_logprobs: + logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) + + maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, + maybe_sampled] + + topk_logprobs, topk_indices = self._topk_logprobs_indices( + logprobs, sampling_metadata) + + maybe_sample_topk_logprobs = topk_logprobs[ + maybe_sample_logits_indices, :] + maybe_sample_topk_indices = topk_indices[ + maybe_sample_logits_indices, :] + prompt_topk_logprobs = topk_logprobs[prompt_logits_mask, :] + prompt_topk_indices = topk_indices[prompt_logits_mask, :] + + # Concat sampled token logprobs + maybe_sample_topk_logprobs = torch.cat( + (maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1) + #Concat sampled token id + maybe_sample_topk_indices = torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1) + elif do_logprobs: + logprobs = self.get_logprobs( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices, :]) + + maybe_sampled_logprobs = logprobs[ + torch.arange(maybe_sampled.shape[0]), maybe_sampled] + + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = self._topk_logprobs_indices(logprobs, sampling_metadata) + + # Concat sampled token logprobs + maybe_sample_topk_logprobs = torch.cat( + (maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1) + #Concat sampled token id + maybe_sample_topk_indices = torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1) + + ( + prompt_topk_logprobs, + prompt_topk_indices, + ) = (None, None) + + elif do_prompt_logprobs: + logprobs = self.get_logprobs( + logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) + + prompt_topk_logprobs, prompt_topk_indices = ( + self._topk_logprobs_indices(logprobs, sampling_metadata)) + 
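In tensor terms, the concatenation performed above for sample logprobs produces the `[num_reqs, max_num_logprobs + 1]` layout described in vllm/v1/outputs.py. A toy sketch, using a random single-row logits tensor and greedy "sampling" purely for illustration:

```python
import torch

torch.manual_seed(0)
num_reqs, vocab_size, k = 1, 8, 3
logprobs = torch.log_softmax(torch.randn(num_reqs, vocab_size), dim=-1)

# Top-k candidates per request.
topk_logprobs, topk_token_ids = torch.topk(logprobs, k, dim=-1)

# Greedy "sampling" stands in for the real sampler here.
sampled = torch.argmax(logprobs, dim=-1)
sampled_logprobs = logprobs[torch.arange(num_reqs), sampled]

# Resulting shape is [num_reqs, k + 1]: top-k columns plus the sampled token.
logprob_values = torch.cat((topk_logprobs, sampled_logprobs.unsqueeze(-1)),
                           dim=-1)
logprob_token_ids = torch.cat((topk_token_ids, sampled.unsqueeze(-1)),
                              dim=-1).to(torch.int32)
print(logprob_values.shape, logprob_token_ids.shape)  # [1, 4] each
```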
+ ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = (None, None) + else: + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + prompt_topk_logprobs, + prompt_topk_indices, + ) = (None, None, None, None) sampler_output = SamplerOutput( - sampled_token_ids=sampled, - logprob_token_ids=topk_indices, - logprobs=topk_logprobs, - prompt_logprob_token_ids=None, - prompt_logprobs=None, - ) + sampled_token_ids=maybe_sampled, + logprob_token_ids=maybe_sample_topk_indices, + logprobs=maybe_sample_topk_logprobs, + prompt_logprob_token_ids=prompt_topk_indices, + prompt_logprobs=prompt_topk_logprobs) + return sampler_output - def apply_temperature( + def _apply_temperature( self, logits: torch.Tensor, temp: torch.Tensor, @@ -59,7 +168,7 @@ def apply_temperature( logits.div_(temp.unsqueeze(dim=1)) return logits - def apply_top_k_top_p( + def _apply_top_k_top_p( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 13cbc8fa39c03..0a3fb0535e35a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -207,7 +207,15 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): + def _prepare_inputs( + self, + scheduler_output: "SchedulerOutput", + sampling_metadata: SamplingMetadata, + num_input_tokens: int, + ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, + torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -240,8 +248,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] arange_matrix = np.tile(np.arange(max_num_scheduled_tokens), (num_reqs, 1)) - mask = arange_matrix < num_scheduled_tokens[:, np.newaxis] - arange = arange_matrix[mask] + prompt_logits_mask = arange_matrix < num_scheduled_tokens[:, + np.newaxis] + arange = arange_matrix[prompt_logits_mask] # Get positions. positions = torch.empty((total_num_scheduled_tokens, ), @@ -321,8 +330,27 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. # TODO: Support prompt logprobs. 
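The tile-and-mask trick in the _prepare_inputs hunk above builds the flattened per-request position offsets without a Python loop. A small NumPy sketch reproducing the example given in the comment (the token counts are made up):

import numpy as np

num_scheduled_tokens = np.array([2, 5, 3], dtype=np.int32)
max_n = int(num_scheduled_tokens.max())

arange_matrix = np.tile(np.arange(max_n), (len(num_scheduled_tokens), 1))
mask = arange_matrix < num_scheduled_tokens[:, np.newaxis]
arange = arange_matrix[mask]
print(arange)  # [0 1 0 1 2 3 4 0 1 2]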
- logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + maybe_sample_logits_indices = query_start_loc[1:] - 1 + num_query_tokens = torch.diff(query_start_loc) + num_sampled_tokens = torch.tensor( + scheduler_output.partial_running_reqs, device=self.device) + + # One or more requests require prompt logprobs + complete_req_mask = torch.tensor( + [not x for x in scheduler_output.partial_running_reqs]) + + if do_prompt_logprobs: + prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) + prompt_logits_mask[ + maybe_sample_logits_indices[complete_req_mask]] = False + + return (input_ids, attn_metadata, num_query_tokens, + num_sampled_tokens, maybe_sample_logits_indices, + prompt_logits_mask) + else: + # No requests require prompt logprobs + return (input_ids, attn_metadata, num_query_tokens, + num_sampled_tokens, maybe_sample_logits_indices, None) def _prepare_sampling( self, @@ -421,9 +449,8 @@ def execute_model( self._execute_encoder(scheduler_output) encoder_outputs = self._gather_encoder_outputs(scheduler_output) - # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + sampling_metadata = self._prepare_sampling(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -435,6 +462,21 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + + # Prepare the decoder inputs. + ( + input_ids, + attn_metadata, + num_query_tokens, + num_sampled_tokens, + maybe_sample_logits_indices, + prompt_logits_mask, + ) = self._prepare_inputs(scheduler_output=scheduler_output, + sampling_metadata=sampling_metadata, + num_input_tokens=num_input_tokens) + # Get the inputs embeds. if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( @@ -456,14 +498,18 @@ def execute_model( attn_metadata=None, inputs_embeds=self.inputs_embeds[:num_input_tokens], ) + hidden_states = hidden_states[:num_scheduled_tokens] - hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(hidden_states, None) + + sampling_metadata.num_query_tokens = num_query_tokens + sampling_metadata.num_sampled_tokens = num_sampled_tokens + sampling_metadata.maybe_sample_logits_indices = ( + maybe_sample_logits_indices) + sampling_metadata.prompt_logits_mask = prompt_logits_mask # Sample the next token and get logprobs if needed. 
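To make the mask logic above concrete, here is a toy reconstruction of how the sample indices and the prompt-position mask fall out of query_start_loc; the numbers are invented and this is not the runner's actual buffers:

import torch

# Three requests scheduled with 3, 4 and 2 tokens in this step.
query_start_loc = torch.tensor([0, 3, 7, 9])
num_input_tokens = 9

# The last scheduled position of each request is where a token may be sampled.
maybe_sample_logits_indices = query_start_loc[1:] - 1  # tensor([2, 6, 8])
num_query_tokens = torch.diff(query_start_loc)         # tensor([3, 4, 2])

# Suppose only the third request is partial (its prefill is still chunked),
# so its "sample" position is really still a prompt position.
partial_running_reqs = [False, False, True]
prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool)
prompt_logits_mask[maybe_sample_logits_indices[
    ~torch.tensor(partial_running_reqs)]] = False
# Prompt positions are 0-1, 3-5 and 7-8; decode positions 2 and 6 are excluded.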
- sampling_metadata = self._prepare_sampling(scheduler_output) sampler_output = self.model.sample( - logits=logits, + logits=self.model.compute_logits(hidden_states, None), sampling_metadata=sampling_metadata, ) @@ -491,21 +537,27 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - if sampler_output.logprob_token_ids is None: - logprob_token_ids = None - else: - logprob_token_ids = sampler_output.logprob_token_ids.cpu() - if sampler_output.logprobs is None: - logprobs = None - else: - logprobs = sampler_output.logprobs.cpu() + ( + logprob_token_ids, + logprobs, + ) = ((sampler_output.logprob_token_ids.cpu(), + sampler_output.logprobs.cpu()) if do_logprobs else (None, None)) + + ( + prompt_logprob_token_ids, + prompt_logprobs, + ) = ((sampler_output.prompt_logprob_token_ids.cpu(), + sampler_output.prompt_logprobs.cpu()) if do_prompt_logprobs else + (None, None)) + model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, logprob_token_ids_cpu=logprob_token_ids, logprobs_cpu=logprobs, - ) + prompt_logprob_token_ids_cpu=prompt_logprob_token_ids, + prompt_logprobs_cpu=prompt_logprobs) return model_runner_output def load_model(self) -> None: @@ -692,6 +744,7 @@ def __init__( self.generators: Dict[int, torch.Generator] = {} self.num_logprobs: Dict[str, int] = {} + self.num_prompt_logprobs: Dict[str, int] = {} self.prompt_logprob_reqs: Set[str] = set() def add_request( @@ -737,8 +790,11 @@ def add_request( self.generators[req_index] = request.generator num_logprobs = sampling_params.logprobs + num_prompt_logprobs = sampling_params.prompt_logprobs if num_logprobs is not None and num_logprobs > 0: self.num_logprobs[req_id] = num_logprobs + if num_prompt_logprobs is not None and num_prompt_logprobs > 0: + self.num_prompt_logprobs[req_id] = num_prompt_logprobs if sampling_params.prompt_logprobs: self.prompt_logprob_reqs.add(req_id) @@ -754,6 +810,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.top_k_reqs.discard(req_id) self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) + self.num_prompt_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) return req_index @@ -766,6 +823,7 @@ def clear(self) -> None: self.top_k_reqs.clear() self.generators.clear() self.num_logprobs.clear() + self.num_prompt_logprobs.clear() self.prompt_logprob_reqs.clear() def condense(self, empty_req_indices: List[int]) -> None: @@ -832,7 +890,7 @@ def make_sampling_metadata( no_top_k=self.no_top_k, generators=self.generators, max_num_logprobs=self.max_num_logprobs, - ) + max_num_prompt_logprobs=self.max_num_prompt_logprobs) @property def num_reqs(self) -> int: @@ -858,6 +916,11 @@ def no_top_k(self) -> bool: def max_num_logprobs(self) -> int: return max(self.num_logprobs.values()) if self.num_logprobs else 0 + @property + def max_num_prompt_logprobs(self) -> int: + return (max(self.num_prompt_logprobs.values()) + if self.num_prompt_logprobs else 0) + @property def no_logprob(self) -> bool: return len(self.num_logprobs) == 0 From 5daabf0a8413897b6691f06fabdefd4fe6d968b3 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 07:58:04 -0500 Subject: [PATCH 009/293] fixed issue with sample-logprob-only batches Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/sample/sampler.py 
b/vllm/v1/sample/sampler.py index 77424df30e9ca..26dd4bafcff44 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -101,8 +101,7 @@ def forward( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), dim=-1) elif do_logprobs: - logprobs = self.get_logprobs( - logits_w_tmp_tpk_tpp[maybe_sample_logits_indices, :]) + logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) maybe_sampled_logprobs = logprobs[ torch.arange(maybe_sampled.shape[0]), maybe_sampled] From ec793e75d6f1a0f3b402bd774a0253ca1d4f7129 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 08:23:27 -0500 Subject: [PATCH 010/293] refactored logprobs tensor pythonization in scheduler Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 2 - vllm/outputs.py | 13 +++++- vllm/v1/core/scheduler.py | 68 +++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 114ce7bd1f2fb..29e193e28092f 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -33,14 +33,12 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: no logprobs Args: - batch_logprobs_composition: types of logprobs configs to include in batch Returns: List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) tuples - """ if batch_logprobs_composition == "NONE": # No requests with sample or prompt logprobs diff --git a/vllm/outputs.py b/vllm/outputs.py index 9733158504945..912e485e40b59 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -131,7 +131,18 @@ def new( prompt_logprobs: Optional[PromptLogprobs], finished: bool = False, ) -> "RequestOutput": - """Initialize a new RequestOutput object.""" + """Initialize a new RequestOutput object. + + Args: + request_id + prompt: optional single prompt string + prompt_token_ids: optional list of prompt tokens + text: completion text + token_ids: completion token ids + logprobs: completion sample logprobs + prompt_logprobs: prompt logprobs + finished + """ # TODO: Support `n` > 1. completion_output = CompletionOutput(index=0, diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 476b12c705482..0e09da028b16f 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -385,29 +385,77 @@ def _try_schedule_encoder_inputs( encoder_inputs_to_schedule.append(i) return encoder_inputs_to_schedule, num_new_tokens, encoder_budget - def update_from_output( + def _pythonize_logprobs( self, - scheduler_output: "SchedulerOutput", + do_logprobs: bool, + do_prompt_logprobs: bool, model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: - # NOTE(woosuk): This method doesn't consider speculative decoding. - sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() - num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_logprobs = model_runner_output.logprobs_cpu is not None - do_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu is not None - and len(model_runner_output.prompt_logprobs_cpu) > 0) + ) -> Tuple[List, List, List, List]: + """Convert logprobs tensors to Python data structures. 
+ + Args: + do_logprobs: sample logprobs are required + do_prompt_logprobs: prompt logprobs are required + model_runner_output: model runner output contains CPU logprobs tensors + + Returns: + logprob_token_ids_list + logprob_values_list + prompt_logprob_token_ids_list + prompt_logprob_values_list + """ if do_logprobs: + # Pythonize sample logprobs if needed assert model_runner_output.logprob_token_ids_cpu is not None logprob_token_ids_list = ( model_runner_output.logprob_token_ids_cpu.tolist()) logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + else: + ( + logprob_token_ids_list, + logprob_values_list, + ) = (None, None) if do_prompt_logprobs: + # Pythonize prompt logprobs if needed assert model_runner_output.prompt_logprob_token_ids_cpu is not None prompt_logprob_token_ids_list = ( model_runner_output.prompt_logprob_token_ids_cpu.tolist()) prompt_logprob_values_list = ( model_runner_output.prompt_logprobs_cpu.tolist()) + else: + ( + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = (None, None) + + return (logprob_token_ids_list, logprob_values_list, + prompt_logprob_token_ids_list, prompt_logprob_values_list) + + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[EngineCoreOutput]: + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + + # Get logprobs as Python data structures + ( + logprob_token_ids_list, + logprob_values_list, + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, + model_runner_output) + + if do_prompt_logprobs: + # Index into prompt tokens, for building + # prompt logprobs output data structure curr_prompt_base_idx = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] From 7593f88f51872ecf588a77e894ec3330b049e439 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 08:42:57 -0500 Subject: [PATCH 011/293] added fast logprobs test Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 131 +++++++++++++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 26 ++---- 2 files changed, 104 insertions(+), 53 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 29e193e28092f..86d34a8285a86 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -75,50 +75,17 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: raise ValueError("Invalid logprobs batch configuration for test.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) -@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) -@pytest.mark.parametrize("batch_logprobs_composition", - ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) -def test_get_logprobs_and_prompt_logprobs( +def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, model: str, dtype: str, - # detokenize: bool, + detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, 
monkeypatch, -): - """Test V1 Engine logprobs & prompt logprobs - - Exercise a variety of combinations of `logprobs` and `prompt_logprobs` - settings and validate that - * The generated logprobs and prompt logprobs are consistent with the - configuration settings, in terms of whether or not the logprobs - (of either type) were requested and how many were requested - * The generated logprobs are consistent with the generated tokens - * The generated (prompt)logprobs are consistent with HuggingFace - (prompt)logprobs, as a reference - - batch_logprobs_composition controls the logprobs configurations for - requests in the batch under test. - - Args: - hf_runner - vllm_runner - model - dtype - detokenize: if False, return generated tokens bypassing detokenizer - batch_logprobs_composition: logprobs configuration for test batch - example_prompts - monkeypatch - """ - detokenize = True - +) -> None: test_prompts = example_prompts # LLM engine v1 @@ -273,6 +240,98 @@ def test_get_logprobs_and_prompt_logprobs( assert vllm_result.prompt_logprobs is None +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +def test_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +) -> None: + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. 
+ + Args: + hf_runner + vllm_runner + model + dtype + detokenize: if False, return generated tokens bypassing detokenizer + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + detokenize = True + + _test_case_get_logprobs_and_prompt_logprobs( + hf_runner=hf_runner, + vllm_runner=vllm_runner, + model=model, + dtype=dtype, + detokenize=detokenize, + batch_logprobs_composition=batch_logprobs_composition, + max_num_batched_tokens=max_num_batched_tokens, + example_prompts=example_prompts, + monkeypatch=monkeypatch) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128]) +@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +def test_fast_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +) -> None: + """Fast test: V1 Engine logprobs & prompt logprobs + + Faster version of `test_get_logprobs_and_prompt_logprobs` with + fewer test cases. + """ + detokenize = True + + _test_case_get_logprobs_and_prompt_logprobs( + hf_runner=hf_runner, + vllm_runner=vllm_runner, + model=model, + dtype=dtype, + detokenize=detokenize, + batch_logprobs_composition=batch_logprobs_composition, + max_num_batched_tokens=max_num_batched_tokens, + example_prompts=example_prompts, + monkeypatch=monkeypatch) + + def test_max_logprobs(monkeypatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a3fb0535e35a..96bf7763e98b3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -537,27 +537,19 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - ( - logprob_token_ids, - logprobs, - ) = ((sampler_output.logprob_token_ids.cpu(), - sampler_output.logprobs.cpu()) if do_logprobs else (None, None)) - - ( - prompt_logprob_token_ids, - prompt_logprobs, - ) = ((sampler_output.prompt_logprob_token_ids.cpu(), - sampler_output.prompt_logprobs.cpu()) if do_prompt_logprobs else - (None, None)) - model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, - logprob_token_ids_cpu=logprob_token_ids, - logprobs_cpu=logprobs, - prompt_logprob_token_ids_cpu=prompt_logprob_token_ids, - prompt_logprobs_cpu=prompt_logprobs) + logprob_token_ids_cpu=(sampler_output.logprob_token_ids.cpu() + if do_logprobs else None), + logprobs_cpu=(sampler_output.logprobs.cpu() + if do_logprobs else None), + prompt_logprob_token_ids_cpu=( + sampler_output.prompt_logprob_token_ids.cpu() + if do_prompt_logprobs else None), + prompt_logprobs_cpu=(sampler_output.prompt_logprobs.cpu() + if do_prompt_logprobs else None)) return model_runner_output def load_model(self) -> None: From 9f14c5e8c84d42222adca85de3d30fc2c17b5b65 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 10:11:35 -0500 Subject: [PATCH 012/293] wip refactor Signed-off-by: Andrew Feldman --- vllm/v1/outputs.py | 8 +- vllm/v1/sample/metadata.py | 1 - vllm/v1/sample/sampler.py | 260 +++++++++++++++++++++-------- 
vllm/v1/worker/gpu_model_runner.py | 9 +- 4 files changed, 195 insertions(+), 83 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 3cd0430aabd6f..0bbbf24abd76d 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -11,14 +11,14 @@ class SamplerOutput: sampled_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: Optional[torch.Tensor] + logprob_token_ids: Optional[torch.Tensor] = None # [num_reqs, max_num_logprobs + 1] - logprobs: Optional[torch.Tensor] + logprobs: Optional[torch.Tensor] = None # [num_prompt_tokens, max_num_prompt_logprobs + 1] - prompt_logprob_token_ids: Optional[torch.Tensor] + prompt_logprobs: Optional[torch.Tensor] = None # [num_prompt_tokens, max_num_prompt_logprobs + 1] - prompt_logprobs: Optional[torch.Tensor] + prompt_logprob_token_ids: Optional[torch.Tensor] = None @dataclass diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 3bf5a462d5070..51fdae841971b 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -22,6 +22,5 @@ class SamplingMetadata: max_num_prompt_logprobs: int num_query_tokens: Optional[torch.Tensor] = None - num_sampled_tokens: Optional[torch.Tensor] = None maybe_sample_logits_indices: Optional[torch.Tensor] = None prompt_logits_mask: Optional[torch.Tensor] = None diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 26dd4bafcff44..32abeca59e532 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -47,112 +47,230 @@ def _topk_logprobs_indices( # Use int32 to reduce the tensor size. return topk_logprobs, topk_indices.to(torch.int32) - def forward( + def _compute_logprobs_from_processed_logits( self, - logits: torch.Tensor, + do_logprobs: bool, + do_prompt_logprobs: bool, + maybe_sampled: torch.Tensor, + maybe_sample_logits_indices: Optional[torch.Tensor], + prompt_logits_mask: Optional[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - num_query_tokens = sampling_metadata.num_query_tokens - maybe_sample_logits_indices = ( - sampling_metadata.maybe_sample_logits_indices) - prompt_logits_mask = sampling_metadata.prompt_logits_mask - - if do_prompt_logprobs: - logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( - logits, sampling_metadata, num_query_tokens) - - maybe_sample_logits_w_tmp_tpk_tpp = ( - logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) - else: - maybe_sample_logits_w_tmp_tpk_tpp = ( - self._apply_temperature_top_k_top_p( - logits[maybe_sample_logits_indices], sampling_metadata, - None)) - - maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, - sampling_metadata) - + maybe_sample_logits_w_tmp_tpk_tpp: torch.Tensor, + logits_w_tmp_tpk_tpp: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute sample and prompt logprobs as required by batch config + + Consumes logits which have already had temperature, top-k and top-p + applied. + + `do_logprobs` and `do_prompt_logprobs` control whether sample and + prompt logprobs are computed, respectively. + + This function does not handle the case where no logprobs are required + at the batch level; it is assumed this function will not be called in + that scenario. 
+ + Args: + do_logprobs: compute sample logprobs + do_prompt_logprobs: compute prompt logprobs + maybe_sampled: list of sampled tokens; if there is a partial request, + includes the partial request's sampled token (which + will later be discarded.) + maybe_sample_logits_indices: sequence-offset indices where a new + token is decoded; if there is a partial request, + includes the index of the partial request's sampled + token (which will later be discarded.) + prompt_logits_mask: mask indicating the sequence offsets of prompt + tokens. Note: if there is a partial request, + this mask includes the index of the partial request's + sample token (since this sampled token will be + discarded, but the logprobs computed at this offset + are part of the prompt logprobs.) Note that this means + prompt_logits_mask and maybe_sample_logits_indices + may have overlap. + sampling_metadata + maybe_sample_logits_w_tmp_tpk_tpp: assumed to be logits gathered + from sequence offsets where a new token is being + decoded (including for a partial request); assumed + that temperature, top-k and top-p have been applied. + logits_w_tmp_tpk_tpp: optional; all logits with temperature, top-k, + top-p applied. + + Returns: + Sample logprobs (`None` if `do_logprobs == False`) + Sample logprobs token indices (`None` if `do_logprobs == False`) + Prompt logprobs (`None` if `do_prompt_logprobs == False`) + Prompt logprobs token indices + (`None` if `do_prompt_logprobs == False`) + """ + + assert do_logprobs or do_prompt_logprobs if do_logprobs and do_prompt_logprobs: - logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) - - maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, - maybe_sampled] + # Batch requires sample and prompt logprobs + # - Compute top logprobs for all sequence offsets + logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) topk_logprobs, topk_indices = self._topk_logprobs_indices( logprobs, sampling_metadata) + # - Gather logprobs for sequence offsets where new tokens are + # decoded maybe_sample_topk_logprobs = topk_logprobs[ maybe_sample_logits_indices, :] maybe_sample_topk_indices = topk_indices[ maybe_sample_logits_indices, :] - prompt_topk_logprobs = topk_logprobs[prompt_logits_mask, :] - prompt_topk_indices = topk_indices[prompt_logits_mask, :] - # Concat sampled token logprobs - maybe_sample_topk_logprobs = torch.cat( - (maybe_sample_topk_logprobs, - maybe_sampled_logprobs.unsqueeze(-1)), - dim=-1) - #Concat sampled token id - maybe_sample_topk_indices = torch.cat( - (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), - dim=-1) - elif do_logprobs: - logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) + # - In case sampled tokens are not in the top logprobs at their + # respective sequence offsets, gather logprobs associated with + # sampled tokens + maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, + maybe_sampled] - maybe_sampled_logprobs = logprobs[ - torch.arange(maybe_sampled.shape[0]), maybe_sampled] + return ( + # Sample logprobs (including sampled tokens) + torch.cat((maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1), + # Sample logprobs token indices (including sampled tokens) + torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1), + # Prompt logprobs + topk_logprobs[prompt_logits_mask, :], + # Prompt logprob token indices + topk_indices[prompt_logits_mask, :]) + elif do_logprobs: + # Batch requires only sample logprobs + # - Compute top logprobs only at sequence offsets where new tokens + # are 
being decoded + logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) ( maybe_sample_topk_logprobs, maybe_sample_topk_indices, ) = self._topk_logprobs_indices(logprobs, sampling_metadata) - # Concat sampled token logprobs + # - In case sampled tokens are not in the top logprobs at their + # respective sequence offsets, gather logprobs associated with + # sampled tokens + maybe_sampled_logprobs = logprobs[ + torch.arange(maybe_sampled.shape[0]), maybe_sampled] + + # - Concat sampled token logprobs maybe_sample_topk_logprobs = torch.cat( (maybe_sample_topk_logprobs, maybe_sampled_logprobs.unsqueeze(-1)), dim=-1) - #Concat sampled token id + # - Concat sampled token id maybe_sample_topk_indices = torch.cat( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), dim=-1) - ( - prompt_topk_logprobs, - prompt_topk_indices, - ) = (None, None) + # Return sample logprobs + return (maybe_sample_topk_logprobs, maybe_sample_topk_indices, + None, None) elif do_prompt_logprobs: + # Batch requires only prompt logprobs + + # - Compute top logprobs only at sequence offsets of prompt tokens logprobs = self.get_logprobs( logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) - prompt_topk_logprobs, prompt_topk_indices = ( - self._topk_logprobs_indices(logprobs, sampling_metadata)) + # Return prompt logprobs + return ((None, None) + + self._topk_logprobs_indices(logprobs, sampling_metadata)) - ( - maybe_sample_topk_logprobs, - maybe_sample_topk_indices, - ) = (None, None) + def forward( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + """Implement sampling. + + Apply temperature, top-k and top-p. + Sample from the probability distribution implied by `logits`. + Only sample at sequence offsets where new tokens are decoded. + In the process, compute sample and prompt logprobs (if required.) + + Args: + logits: model output logits which imply probability distribution. + sampling_metadata: sampling config settings + + Returns: + Sampler output. Sampled tokens and sample/prompt logprobs + (if requested) + """ + + # Batch-level logprobs configs. `do_logprobs` indicates whether + # any request requires sample logprobs. `do_prompt_logprobs` + # indicates whether any request requires prompt logprobs. + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + do_any_logprobs = do_logprobs or do_prompt_logprobs + + num_query_tokens = sampling_metadata.num_query_tokens + maybe_sample_logits_indices = ( + sampling_metadata.maybe_sample_logits_indices) + prompt_logits_mask = sampling_metadata.prompt_logits_mask + + # Apply temperature, top-k and top-p to logits at sequence offsets + # where a new token is being decoded. + if do_prompt_logprobs: + # If prompt logprobs are required, then temp/top-k/top-p + # must also be applied to prompt logits as a prerequisite. + # So pass *all* logits through temp/top-k/top-p, then gather + # the processed logits from the sequence offsets where a new token + # is being decoded. + logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( + logits, sampling_metadata, num_query_tokens) + + maybe_sample_logits_w_tmp_tpk_tpp = ( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) else: + # If prompt logprobs are not required, then gather the logits + # only from the sequence offsets where a new token is being + # decoded, and *only* apply temp/top-k/top-p to those logits. 
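For orientation, a generic single-row sketch of what "apply temperature and top-k, then sample" means; this is only an illustration with toy values, not the patch's vectorized _apply_temperature_top_k_top_p (which also handles top-p and per-request parameters):

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])  # one row, toy vocab of 5
temperature, top_k = 0.8, 3

scaled = logits / temperature

# Keep only the top-k logits; everything else gets probability zero.
kth_largest = torch.topk(scaled, top_k, dim=-1).values[:, -1, None]
filtered = scaled.masked_fill(scaled < kth_largest, float("-inf"))

probs = torch.softmax(filtered, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)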
+ maybe_sample_logits_w_tmp_tpk_tpp = ( + self._apply_temperature_top_k_top_p( + logits[maybe_sample_logits_indices], sampling_metadata, + None)) + + # Compute and sample token probability distribution, *only* at sequence + # offsets where a new token is being decoded + maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, + sampling_metadata) + + # Compute sample & prompt logprobs, as-needed + if do_any_logprobs: ( - maybe_sample_topk_logprobs, - maybe_sample_topk_indices, - prompt_topk_logprobs, - prompt_topk_indices, - ) = (None, None, None, None) - - sampler_output = SamplerOutput( - sampled_token_ids=maybe_sampled, - logprob_token_ids=maybe_sample_topk_indices, - logprobs=maybe_sample_topk_logprobs, - prompt_logprob_token_ids=prompt_topk_indices, - prompt_logprobs=prompt_topk_logprobs) - - return sampler_output + maybe_sample_logprobs, + maybe_sample_logprobs_token_indices, + prompt_logprobs, + prompt_logprobs_token_indices, + ) = self._compute_logprobs_from_processed_logits( + do_logprobs=do_logprobs, + do_prompt_logprobs=do_prompt_logprobs, + maybe_sampled=maybe_sampled, + maybe_sample_logits_indices=maybe_sample_logits_indices, + prompt_logits_mask=prompt_logits_mask, + sampling_metadata=sampling_metadata, + maybe_sample_logits_w_tmp_tpk_tpp= + maybe_sample_logits_w_tmp_tpk_tpp, + logits_w_tmp_tpk_tpp=(logits_w_tmp_tpk_tpp + if do_prompt_logprobs else None)) + + # Return decoded output tokens and sample/prompt logprobs, + # as required + return SamplerOutput( + sampled_token_ids=maybe_sampled, + logprobs=maybe_sample_logprobs, + logprob_token_ids=maybe_sample_logprobs_token_indices, + prompt_logprobs=prompt_logprobs, + prompt_logprob_token_ids=prompt_logprobs_token_indices) + else: + # No logprobs; return decoded output tokens + return SamplerOutput(sampled_token_ids=maybe_sampled) def _apply_temperature( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 96bf7763e98b3..dd0d1824246d4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -332,8 +332,6 @@ def _prepare_inputs( # TODO: Support prompt logprobs. 
maybe_sample_logits_indices = query_start_loc[1:] - 1 num_query_tokens = torch.diff(query_start_loc) - num_sampled_tokens = torch.tensor( - scheduler_output.partial_running_reqs, device=self.device) # One or more requests require prompt logprobs complete_req_mask = torch.tensor( @@ -345,12 +343,11 @@ def _prepare_inputs( maybe_sample_logits_indices[complete_req_mask]] = False return (input_ids, attn_metadata, num_query_tokens, - num_sampled_tokens, maybe_sample_logits_indices, - prompt_logits_mask) + maybe_sample_logits_indices, prompt_logits_mask) else: # No requests require prompt logprobs return (input_ids, attn_metadata, num_query_tokens, - num_sampled_tokens, maybe_sample_logits_indices, None) + maybe_sample_logits_indices, None) def _prepare_sampling( self, @@ -470,7 +467,6 @@ def execute_model( input_ids, attn_metadata, num_query_tokens, - num_sampled_tokens, maybe_sample_logits_indices, prompt_logits_mask, ) = self._prepare_inputs(scheduler_output=scheduler_output, @@ -502,7 +498,6 @@ def execute_model( hidden_states = hidden_states[:num_scheduled_tokens] sampling_metadata.num_query_tokens = num_query_tokens - sampling_metadata.num_sampled_tokens = num_sampled_tokens sampling_metadata.maybe_sample_logits_indices = ( maybe_sample_logits_indices) sampling_metadata.prompt_logits_mask = prompt_logits_mask From 3460c187bf66929051fe5c595fd39605899a2823 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 11:11:37 -0500 Subject: [PATCH 013/293] format Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 7 +-- vllm/v1/sample/sampler.py | 72 +++++++++++++++++++----------- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 86d34a8285a86..a303438c8a3d9 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -57,7 +57,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (None, 0), (0, None), (0, 0), - (None, 6), + (None, 7), (0, 5), ] elif batch_logprobs_composition == "SAMPLE_PROMPT": @@ -67,7 +67,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (0, 0), (5, None), (3, 0), - (6, 3), + (7, 3), (None, 6), (0, 5), ] @@ -301,7 +301,8 @@ def test_get_logprobs_and_prompt_logprobs( ["half"]) # needed for comparing logprobs with HF # @pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) def test_fast_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 32abeca59e532..4a0a3afb35e0b 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -36,14 +36,26 @@ def _probs_sample( # Use int32 to reduce the tensor size. 
return sampled.to(torch.int32) - def _topk_logprobs_indices( + def _top_logprobs_token_indices( self, logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, + max_num_logprobs: int, ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute top logprobs and associated token indices + + Args: + logprobs: total_tokens x vocab tensor + max_num_logprobs: Max number of top {sample,prompt} logprobs + requested in batch (depending on whether top sample + logprobs or top prompt logprobs are being computed) - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) + Returns: + Top logprobs, total_tokens x max_num_logprobs tensor + Top logprob token indices, total_tokens x max_num_logprobs tensor + """ + topk_logprobs, topk_indices = torch.topk(logprobs, + max_num_logprobs, + dim=-1) # Use int32 to reduce the tensor size. return topk_logprobs, topk_indices.to(torch.int32) @@ -97,28 +109,33 @@ def _compute_logprobs_from_processed_logits( top-p applied. Returns: - Sample logprobs (`None` if `do_logprobs == False`) - Sample logprobs token indices (`None` if `do_logprobs == False`) - Prompt logprobs (`None` if `do_prompt_logprobs == False`) - Prompt logprobs token indices - (`None` if `do_prompt_logprobs == False`) + Sample logprobs (`None` if `do_logprobs == False`, + o/w num_samples x max_num_logprobs tensor) + Sample logprobs token indices (`None` if `do_logprobs == False`, + o/w num_samples x max_num_logprobs tensor) + Prompt logprobs (`None` if `do_prompt_logprobs == False`, + o/w num_prompt_tokens x max_num_prompt_logprobs + tensor) + Prompt logprobs token indices (`None` if + `do_prompt_logprobs == False`, o/w + num_prompt_tokens x max_num_prompt_logprobs tensor) """ assert do_logprobs or do_prompt_logprobs if do_logprobs and do_prompt_logprobs: # Batch requires sample and prompt logprobs - # - Compute top logprobs for all sequence offsets + # - Compute logprobs for all sequence offsets logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) - topk_logprobs, topk_indices = self._topk_logprobs_indices( - logprobs, sampling_metadata) - # - Gather logprobs for sequence offsets where new tokens are - # decoded - maybe_sample_topk_logprobs = topk_logprobs[ - maybe_sample_logits_indices, :] - maybe_sample_topk_indices = topk_indices[ - maybe_sample_logits_indices, :] + # - Compute *top* logprobs for sequence offsets + # where a new token is being decoded + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = self._top_logprobs_token_indices( + logprobs[maybe_sample_logits_indices, :], + sampling_metadata.max_num_logprobs) # - In case sampled tokens are not in the top logprobs at their # respective sequence offsets, gather logprobs associated with @@ -126,7 +143,7 @@ def _compute_logprobs_from_processed_logits( maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, maybe_sampled] - return ( + return (( # Sample logprobs (including sampled tokens) torch.cat((maybe_sample_topk_logprobs, maybe_sampled_logprobs.unsqueeze(-1)), @@ -134,11 +151,11 @@ def _compute_logprobs_from_processed_logits( # Sample logprobs token indices (including sampled tokens) torch.cat( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), - dim=-1), - # Prompt logprobs - topk_logprobs[prompt_logits_mask, :], - # Prompt logprob token indices - topk_indices[prompt_logits_mask, :]) + dim=-1)) + + # Prompt logprobs and token indices + self._top_logprobs_token_indices( + logprobs[prompt_logits_mask, :], + sampling_metadata.max_num_prompt_logprobs)) elif do_logprobs: # 
Batch requires only sample logprobs @@ -148,7 +165,8 @@ def _compute_logprobs_from_processed_logits( ( maybe_sample_topk_logprobs, maybe_sample_topk_indices, - ) = self._topk_logprobs_indices(logprobs, sampling_metadata) + ) = self._top_logprobs_token_indices( + logprobs, sampling_metadata.max_num_logprobs) # - In case sampled tokens are not in the top logprobs at their # respective sequence offsets, gather logprobs associated with @@ -178,8 +196,8 @@ def _compute_logprobs_from_processed_logits( logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) # Return prompt logprobs - return ((None, None) + - self._topk_logprobs_indices(logprobs, sampling_metadata)) + return ((None, None) + self._top_logprobs_token_indices( + logprobs, sampling_metadata.max_num_prompt_logprobs)) def forward( self, From 9ca0ce0daa81dcb31278f440be824effd739944d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 11:35:10 -0500 Subject: [PATCH 014/293] refactor Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_model_runner.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dd0d1824246d4..1492a3ba89f0a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -329,18 +329,15 @@ def _prepare_inputs( # request in the batch. While we should not sample any token from this # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. - # TODO: Support prompt logprobs. maybe_sample_logits_indices = query_start_loc[1:] - 1 num_query_tokens = torch.diff(query_start_loc) - # One or more requests require prompt logprobs - complete_req_mask = torch.tensor( - [not x for x in scheduler_output.partial_running_reqs]) - if do_prompt_logprobs: prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) - prompt_logits_mask[ - maybe_sample_logits_indices[complete_req_mask]] = False + # Sequence offsets where a token is being decoded are *not* prompt + # tokens, unless the request in question is partial + prompt_logits_mask[maybe_sample_logits_indices[ + ~torch.tensor(scheduler_output.partial_running_reqs)]] = False return (input_ids, attn_metadata, num_query_tokens, maybe_sample_logits_indices, prompt_logits_mask) @@ -448,6 +445,9 @@ def execute_model( sampling_metadata = self._prepare_sampling(scheduler_output) + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -459,9 +459,6 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - # Prepare the decoder inputs. 
( input_ids, From d277d37976a7a5feb36a4d8511af57e046026a1f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 12:53:18 -0500 Subject: [PATCH 015/293] attempted sample_metadata fix; sample logprobs work, prompt logprobs broken Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 31 +++++----- vllm/v1/sample/metadata.py | 10 ++- vllm/v1/sample/sampler.py | 15 ++++- vllm/v1/worker/gpu_model_runner.py | 99 ++++++++++++++++-------------- 4 files changed, 90 insertions(+), 65 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 0e09da028b16f..87113ea2f65e8 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -109,6 +109,7 @@ def schedule(self) -> "SchedulerOutput": # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. has_partial_request = False + partial_req_index = -1 req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". @@ -158,9 +159,11 @@ def schedule(self) -> "SchedulerOutput": ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens + if (request.num_computed_tokens + num_new_tokens < + request.num_tokens): + has_partial_request = True + partial_req_index = req_index req_index += 1 - has_partial_request = (request.num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -236,8 +239,10 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + if (request.num_computed_tokens + num_new_tokens < + request.num_tokens): + has_partial_request = True + partial_req_index = req_index # Encoder-related. if encoder_inputs_to_schedule: @@ -248,13 +253,6 @@ def schedule(self) -> "SchedulerOutput": self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget - # Now that requests are scheduled, generate a mask indicating which - # request is partial - partial_running_reqs = [ - (req.num_computed_tokens + num_scheduled_tokens[req.request_id] < - req.num_tokens) for req in self.running - ] - # Check if the scheduling constraints are satisfied. 
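The partial_req_index bookkeeping above hinges on the chunked-prefill notion of a "partial" request: a prompt whose prefill does not fit within this step's token budget. A toy loop showing how one long prompt stays partial for several steps (illustrative numbers only):

# A 30-token prompt scheduled under a 12-token-per-step budget.
prompt_len, token_budget = 30, 12
num_computed_tokens = 0
step = 0
while num_computed_tokens < prompt_len:
    step += 1
    num_new_tokens = min(token_budget, prompt_len - num_computed_tokens)
    num_computed_tokens += num_new_tokens
    is_partial = num_computed_tokens < prompt_len
    print(step, num_new_tokens, is_partial)
# step 1: 12 tokens, partial; step 2: 12, partial; step 3: 6, no longer partial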
total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -285,7 +283,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, - partial_running_reqs=partial_running_reqs, + partial_req_index=partial_req_index, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -470,9 +468,14 @@ def update_from_output( if do_prompt_logprobs: max_prompt_logprobs = request.max_prompt_logprobs + # Number of new prompt tokens is the number of scheduled + # tokens *if* the request is partial (because the sampled + # token is discarded and all sequence offsets are prompt + # offsets), otherwise it is the number of scheduled + # tokens minus one (for the sampled token) num_new_prompt_tokens = ( num_scheduled_tokens[request.request_id] - - int(not scheduler_output.partial_running_reqs[req_index])) + int(scheduler_output.partial_req_index != req_index)) request_do_prompt_logprobs = (max_prompt_logprobs is not None and max_prompt_logprobs > 0 @@ -774,7 +777,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] - partial_running_reqs: List[bool] # True if running req is partial + partial_req_index: int # >0 if running req is partial, -1 o/w num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 51fdae841971b..c1d817c8f3ffd 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -21,6 +21,10 @@ class SamplingMetadata: max_num_logprobs: int max_num_prompt_logprobs: int - num_query_tokens: Optional[torch.Tensor] = None - maybe_sample_logits_indices: Optional[torch.Tensor] = None - prompt_logits_mask: Optional[torch.Tensor] = None + query_start_loc: Optional[torch.Tensor] + num_query_tokens: Optional[torch.Tensor] + #maybe_sample_logits_indices: Optional[torch.Tensor] = None + #prompt_logits_mask: Optional[torch.Tensor] = None + + num_input_tokens: int + partial_req_index: int # >0 if there is a partial request, -1 o/w diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 4a0a3afb35e0b..4448b55deb868 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -228,9 +228,18 @@ def forward( do_any_logprobs = do_logprobs or do_prompt_logprobs num_query_tokens = sampling_metadata.num_query_tokens - maybe_sample_logits_indices = ( - sampling_metadata.maybe_sample_logits_indices) - prompt_logits_mask = sampling_metadata.prompt_logits_mask + # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial + # request in the batch. While we should not sample any token from this + # partial request, we do so for simplicity. We will ignore the sampled + # token from the partial request. + maybe_sample_logits_indices = sampling_metadata.query_start_loc[1:] - 1 + prompt_logits_mask = torch.ones(sampling_metadata.num_input_tokens, + dtype=torch.bool) + # Sequence offsets where a token is being decoded are *not* prompt + # tokens... + prompt_logits_mask[maybe_sample_logits_indices] = False + # ...unless the request in question is partial. 
+ prompt_logits_mask[sampling_metadata.partial_req_index] = True # Apply temperature, top-k and top-p to logits at sequence offsets # where a new token is being decoded. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1492a3ba89f0a..2e642c5869c97 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -211,10 +211,8 @@ def _prepare_inputs( self, scheduler_output: "SchedulerOutput", sampling_metadata: SamplingMetadata, - num_input_tokens: int, ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -291,14 +289,7 @@ def _prepare_inputs( out=slot_mapping) # Prepare the attention metadata. - query_start_loc = torch.empty((num_reqs + 1, ), - dtype=torch.int32, - device="cpu", - pin_memory=self.pin_memory) - query_start_loc_np = query_start_loc.numpy() - query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) - + query_start_loc = sampling_metadata.query_start_loc seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens) max_seq_len = seq_lens.max() @@ -313,7 +304,6 @@ def _prepare_inputs( input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() attn_metadata = FlashAttentionMetadata( @@ -329,26 +319,12 @@ def _prepare_inputs( # request in the batch. While we should not sample any token from this # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. - maybe_sample_logits_indices = query_start_loc[1:] - 1 - num_query_tokens = torch.diff(query_start_loc) - - if do_prompt_logprobs: - prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) - # Sequence offsets where a token is being decoded are *not* prompt - # tokens, unless the request in question is partial - prompt_logits_mask[maybe_sample_logits_indices[ - ~torch.tensor(scheduler_output.partial_running_reqs)]] = False - - return (input_ids, attn_metadata, num_query_tokens, - maybe_sample_logits_indices, prompt_logits_mask) - else: - # No requests require prompt logprobs - return (input_ids, attn_metadata, num_query_tokens, - maybe_sample_logits_indices, None) + return (input_ids, attn_metadata) def _prepare_sampling( self, scheduler_output: "SchedulerOutput", + num_input_tokens: int, ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids @@ -358,7 +334,11 @@ def _prepare_sampling( or scheduler_output.scheduled_resumed_reqs): skip_copy = False # Create the sampling metadata. 
- sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) + sampling_metadata = self.input_batch.make_sampling_metadata( + scheduler_output, + num_input_tokens, + skip_copy, + ) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -443,11 +423,6 @@ def execute_model( self._execute_encoder(scheduler_output) encoder_outputs = self._gather_encoder_outputs(scheduler_output) - sampling_metadata = self._prepare_sampling(scheduler_output) - - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -459,16 +434,17 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + sampling_metadata = self._prepare_sampling(scheduler_output, + num_input_tokens) + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + # Prepare the decoder inputs. ( input_ids, attn_metadata, - num_query_tokens, - maybe_sample_logits_indices, - prompt_logits_mask, ) = self._prepare_inputs(scheduler_output=scheduler_output, - sampling_metadata=sampling_metadata, - num_input_tokens=num_input_tokens) + sampling_metadata=sampling_metadata) # Get the inputs embeds. if encoder_outputs: @@ -494,11 +470,6 @@ def execute_model( hidden_states = hidden_states[:num_scheduled_tokens] - sampling_metadata.num_query_tokens = num_query_tokens - sampling_metadata.maybe_sample_logits_indices = ( - maybe_sample_logits_indices) - sampling_metadata.prompt_logits_mask = prompt_logits_mask - # Sample the next token and get logprobs if needed. sampler_output = self.model.sample( logits=self.model.compute_logits(hidden_states, None), @@ -855,6 +826,8 @@ def condense(self, empty_req_indices: List[int]) -> None: def make_sampling_metadata( self, + scheduler_output: "SchedulerOutput", + num_input_tokens: int, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -864,8 +837,36 @@ def make_sampling_metadata( self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) self.top_k[:self.num_reqs].copy_( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) + + num_reqs = self.num_reqs + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = [] + max_num_scheduled_tokens = 0 + for req_id in self.req_ids[:num_reqs]: + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens.append(num_tokens) + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32) + assert max_num_scheduled_tokens > 0 + + # Compute query start offsets. It makes sense to compute this here + # rather than in model runner _prepare_inputs() because query start + # offsets are required for computing num_query_tokens in the scenario + # where prompt logprobs are required by the batch. 
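The comment above motivates carrying num_query_tokens in the sampling metadata. One plausible use on the prompt-logprobs path is expanding per-request sampling parameters to one value per token position; the body of _apply_temperature_top_k_top_p is not part of this excerpt, so the expansion below is an assumption, shown only with toy values:

import torch

temperature = torch.tensor([1.0, 0.7, 0.9])      # per request
num_query_tokens = torch.tensor([3, 4, 2])       # scheduled tokens per request

# One temperature entry per token position.
per_token_temperature = torch.repeat_interleave(temperature, num_query_tokens)
# tensor([1.0, 1.0, 1.0, 0.7, 0.7, 0.7, 0.7, 0.9, 0.9])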
+ query_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) + query_start_loc = query_start_loc.to(self.device, non_blocking=True) + return SamplingMetadata( - temperature=self.temperature[:self.num_reqs], + temperature=self.temperature[:num_reqs], all_greedy=self.all_greedy, all_random=self.all_random, top_p=self.top_p[:self.num_reqs], @@ -874,7 +875,15 @@ def make_sampling_metadata( no_top_k=self.no_top_k, generators=self.generators, max_num_logprobs=self.max_num_logprobs, - max_num_prompt_logprobs=self.max_num_prompt_logprobs) + max_num_prompt_logprobs=self.max_num_prompt_logprobs, + query_start_loc=query_start_loc, + num_input_tokens=num_input_tokens, + partial_req_index=scheduler_output.partial_req_index, + # Required for prompt logprobs temperature computation. + # If prompt logprobs is not required for this batch, then + # avoid storing num_query_tokens + num_query_tokens=(torch.diff(query_start_loc) + if self.max_num_prompt_logprobs > 0 else None)) @property def num_reqs(self) -> int: From 9416be56f97d143f65003d22222d002edbfe1806 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:11:16 -0800 Subject: [PATCH 016/293] [Bugfix] Fix for Spec model TP + Chunked Prefill (#10232) Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com> Signed-off-by: Sourashis Roy Co-authored-by: Sourashis Roy --- docs/source/serving/compatibility_matrix.rst | 2 +- tests/core/test_chunked_prefill_scheduler.py | 39 +++++++++++++ tests/spec_decode/e2e/test_compatibility.py | 46 --------------- .../e2e/test_integration_dist_tp2.py | 57 +++++++++++++++++++ tests/spec_decode/test_spec_decode_worker.py | 3 +- vllm/config.py | 10 ---- vllm/core/scheduler.py | 28 ++++++--- vllm/spec_decode/spec_decode_worker.py | 33 +++++++++-- 8 files changed, 145 insertions(+), 73 deletions(-) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index fa03d2cde1486..a93632ff36fb8 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -118,7 +118,7 @@ Feature x Feature - - * - :ref:`SD ` - - ✗ + - ✅ - ✅ - ✗ - ✅ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index acd82065ae457..eaaf004df38b2 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -413,6 +413,45 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens +@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) +def test_chunked_prefill_spec_prefill(num_scheduler_steps): + """Verify that the num_lookahead_slots is set appropriately for an all""" + """prefill batch depending on whether multi-step scheduling is enabled""" + """or not""" + block_size = 4 + max_seqs = 30 + max_model_len = 200 + max_num_batched_tokens = 30 + num_lookahead_slots = 4 + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + max_seqs, + max_model_len, + enable_chunked_prefill=True, + num_lookahead_slots=num_lookahead_slots, + num_scheduler_steps=num_scheduler_steps, + ) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 16 + cache_config.num_gpu_blocks = 16 + scheduler = 
Scheduler(scheduler_config, cache_config, None) + + _, seq_group = create_dummy_prompt("1", + prompt_length=30, + block_size=block_size) + scheduler.add_seq_group(seq_group) + _, out = schedule_and_update_computed_tokens(scheduler) + # The request is chunked. + # prefill scheduled now. + assert len(out.scheduled_seq_groups) == 1 + assert out.num_prefill_groups == 1 + assert out.num_batched_tokens == max_num_batched_tokens + print(out.num_lookahead_slots) + assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else + num_lookahead_slots) + + def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index a3f0464e79675..af8397c235f48 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -50,49 +50,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): with pytest.raises(ValueError, match="cannot be larger than"): get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) - - -@pytest.mark.parametrize("common_llm_kwargs", - [{ - "model": "meta-llama/Llama-2-7b-chat-hf", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": "True", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "tensor_parallel_size": 2, - "speculative_draft_tensor_parallel_size": 2, - }, - { - "tensor_parallel_size": 4, - "speculative_draft_tensor_parallel_size": 4, - }, - { - "tensor_parallel_size": 8, - "speculative_draft_tensor_parallel_size": 8, - }, -]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_chunked_prefill_draft_model_tp_not_one( - test_llm_generator): - """Verify that speculative decoding fails if chunked prefill is enabled for - draft model with tensor parallelism of more than 1. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, match="with tensor parallel size 1"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 25562ca85adf4..02cba92795142 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -115,3 +115,60 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, max_output_len=32, seed=seed, temperature=0.0) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [[ + # Skip cuda graph recording for fast test. 
+ "--enforce-eager", + "--tensor_parallel_size", + "2", + + # precision + "--dtype", + "bfloat16", + ]]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [["--enable-chunked-prefill", "False"], + [ + "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", + "--max-num-seqs", "4" + ]]) +@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) +@pytest.mark.parametrize("model, test_llm_kwargs", + [("JackFram/llama-68m", [ + "--speculative-model", + "JackFram/llama-68m", + "--num_speculative-tokens", + "3", + ]), + ("JackFram/llama-68m", [ + "--speculative-model", + "JackFram/llama-68m", + "--num_speculative-tokens", + "3", + "--speculative-draft-tensor-parallel-size", + "1", + ])]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, seed: int): + """Verify spec decode works well with same and different TP size for + the draft model with chunked prefill. + """ + run_equality_correctness_test_tp(model, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=32, + seed=seed, + temperature=0.0) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 8df143104c279..d7caf57147278 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -867,7 +867,8 @@ def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): target_group_metadata_list = prefill + decodes execute_model_req = ExecuteModelRequest( seq_group_metadata_list=target_group_metadata_list, - num_lookahead_slots=k) + # For prefill only batches we expect num_lookahead_slots = 0. + num_lookahead_slots=k if n_decodes > 0 else 0) target_token_ids = torch.randint(low=0, high=vocab_size, diff --git a/vllm/config.py b/vllm/config.py index c87feaec3e5f6..eae6f909e3933 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1409,16 +1409,6 @@ def maybe_create_spec_config( draft_hf_config ) - if (enable_chunked_prefill and \ - speculative_draft_tensor_parallel_size != 1): - # TODO - Investigate why the error reported in - # https://github.com/vllm-project/vllm/pull/9291#issuecomment-2463266258 - # is happening and re-enable it. - raise ValueError( - "Chunked prefill and speculative decoding can be enabled " - "simultaneously only for draft models with tensor " - "parallel size 1.") - draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 530cbdc3a9190..d23009dae01ee 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1201,15 +1201,25 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) # Put prefills first due to Attention backend ordering assumption. 
+ scheduled_seq_groups = (prefills.seq_groups + + running_scheduled.prefill_seq_groups + + swapped_in.prefill_seq_groups + + running_scheduled.decode_seq_groups + + swapped_in.decode_seq_groups) + num_prefill_groups = (len(prefills.seq_groups) + + len(swapped_in.prefill_seq_groups) + + len(running_scheduled.prefill_seq_groups)) + # If all prompts, then we set num_lookahead_slots to 0 + # this allows us to go through the `no_spec` path in + # `spec_decode_worker.py` + all_prefills = (len(scheduled_seq_groups) == num_prefill_groups) + num_lookahead_slots = (0 if + (all_prefills + and not self.scheduler_config.is_multi_step) + else running_scheduled.num_lookahead_slots) return SchedulerOutputs( - scheduled_seq_groups=(prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups), - num_prefill_groups=(len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)), + scheduled_seq_groups=scheduled_seq_groups, + num_prefill_groups=num_prefill_groups, num_batched_tokens=budget.num_batched_tokens + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, @@ -1218,7 +1228,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, + num_lookahead_slots=num_lookahead_slots, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b57742c2ebfdd..b279931ca4b02 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -408,7 +408,20 @@ def execute_model( disable_all_speculation = self._should_disable_all_speculation( execute_model_req) num_lookahead_slots = execute_model_req.num_lookahead_slots - + all_prompt = True + atleast_one_prompt = False + all_zero_spec_tokens = True + for sgm in execute_model_req.seq_group_metadata_list: + all_prompt = all_prompt and sgm.is_prompt + atleast_one_prompt = atleast_one_prompt or sgm.is_prompt + all_zero_spec_tokens = all_zero_spec_tokens and ( + sgm.num_speculative_tokens == 0) + + if all_prompt and execute_model_req.seq_group_metadata_list: + assert num_lookahead_slots == 0, ( + "Prompt only runs should have num_lookahead_slots equal to 0. " + "This should never happen, please file a bug at " + "https://github.com/vllm-project/vllm/issues") # Speculative decoding is disabled in the following cases: # 1. Prefill phase: Speculative decoding is not # used during the prefill phase. @@ -419,11 +432,8 @@ def execute_model( # In any of these cases, the proposer and scorer workers # are called normally. # We expect `num_speculative_tokens` to be None for prefills. - no_spec = all( - sgm.is_prompt for sgm in execute_model_req.seq_group_metadata_list - ) or num_lookahead_slots == 0 or disable_all_speculation or all( - sgm.num_speculative_tokens == 0 - for sgm in execute_model_req.seq_group_metadata_list) + no_spec = (num_lookahead_slots == 0 or disable_all_speculation + or all_zero_spec_tokens) # Broadcast how many lookahead slots are scheduled for this step, and # whether all speculation is disabled, to all non-driver workers. 
@@ -442,6 +452,15 @@ def execute_model( num_lookahead_slots=num_lookahead_slots, no_spec=no_spec, disable_all_speculation=disable_all_speculation, + # When both chunked prefill and speculative decoding are enabled + # it is possible that the same batch contains both prefill + # and decodes. If that happens in the scorer we run the batch + # as one single forward pass. However, in the proposer we + # run them as 2 different batches - one for prefill and + # the other for decodes. The variable indicates to the non-driver + # worker that there are prefills as part of the speculative batch + # and hence it needs to run an extra prefill forward pass. + run_spec_proposer_for_prefill=atleast_one_prompt, ) broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) @@ -653,6 +672,8 @@ def _run_non_driver_rank(self) -> bool: if not data["no_spec"]: self.scorer_worker.execute_model() + if data["run_spec_proposer_for_prefill"]: + self.proposer_worker.execute_model() return True From f694c57e62099599278561c9a759d8db2e790441 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 14:03:20 -0500 Subject: [PATCH 017/293] cleaned up sampling metadata Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 24 +++++++++++++++--------- vllm/v1/sample/metadata.py | 6 +++--- vllm/v1/sample/sampler.py | 5 +++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 87113ea2f65e8..5ada9ceab54e6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -109,7 +109,6 @@ def schedule(self) -> "SchedulerOutput": # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. has_partial_request = False - partial_req_index = -1 req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". @@ -159,10 +158,8 @@ def schedule(self) -> "SchedulerOutput": ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens - if (request.num_computed_tokens + num_new_tokens < - request.num_tokens): - has_partial_request = True - partial_req_index = req_index + has_partial_request = (request.num_computed_tokens + num_new_tokens + < request.num_tokens) req_index += 1 # Encoder-related. @@ -239,10 +236,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - if (request.num_computed_tokens + num_new_tokens < - request.num_tokens): - has_partial_request = True - partial_req_index = req_index + has_partial_request = (request.num_computed_tokens + + num_new_tokens < request.num_tokens) # Encoder-related. 
if encoder_inputs_to_schedule: @@ -279,6 +274,17 @@ def schedule(self) -> "SchedulerOutput": req.num_computed_tokens) for req in scheduled_running_reqs ] preempted_req_ids = {req.request_id for req in preempted_reqs} + + partial_req_indices = [ + idx for idx, request in enumerate(self.running) + if request.num_computed_tokens + + num_scheduled_tokens[request.request_id] < request.num_tokens + ] + num_partial_reqs = len(partial_req_indices) + assert num_partial_reqs < 2 + partial_req_index = (partial_req_indices[0] + if num_partial_reqs > 0 else -1) + scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index c1d817c8f3ffd..b9c97bcfb0d47 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -18,13 +18,13 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] + # Max number of sample or prompt logprobs + # (respectiely) at the batch level max_num_logprobs: int max_num_prompt_logprobs: int + # Attributes which support logprob computation query_start_loc: Optional[torch.Tensor] num_query_tokens: Optional[torch.Tensor] - #maybe_sample_logits_indices: Optional[torch.Tensor] = None - #prompt_logits_mask: Optional[torch.Tensor] = None - num_input_tokens: int partial_req_index: int # >0 if there is a partial request, -1 o/w diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 4448b55deb868..e0b03f7aa03b3 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -237,9 +237,10 @@ def forward( dtype=torch.bool) # Sequence offsets where a token is being decoded are *not* prompt # tokens... + pdx = sampling_metadata.partial_req_index prompt_logits_mask[maybe_sample_logits_indices] = False - # ...unless the request in question is partial. - prompt_logits_mask[sampling_metadata.partial_req_index] = True + # ...unless the request in question is partial + prompt_logits_mask[maybe_sample_logits_indices[pdx]] = True # Apply temperature, top-k and top-p to logits at sequence offsets # where a new token is being decoded. From b2d6303c34b92e5dbd9e51327a3b021fa0472f92 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 14:12:50 -0500 Subject: [PATCH 018/293] small change Signed-off-by: Andrew Feldman --- vllm/v1/engine/llm_engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b93634230529e..402a1c5dc85ad 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -160,11 +160,8 @@ def step(self) -> List[RequestOutput]: return request_outputs def get_model_config(self): - """Gets the model configuration.""" return self.model_config - # TODO(rob): Can we get rid of these? 
- def start_profile(self): self.engine_core.profile(True) From f095097ea49cac45879243bd7e0d7479fdd54209 Mon Sep 17 00:00:00 2001 From: Conroy Cheers Date: Wed, 27 Nov 2024 05:26:28 +1100 Subject: [PATCH 019/293] [Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735) Signed-off-by: Conroy Cheers --- CMakeLists.txt | 10 +- vllm/platforms/__init__.py | 10 +- vllm/platforms/cuda.py | 222 +++++++++++++++++++++++-------------- 3 files changed, 155 insertions(+), 87 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff34225537cdd..882d4412632a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") @@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -300,8 +300,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -427,7 +427,7 @@ set_gencode_flags_for_srcs( CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 1f68fc2e25df3..7cb8ac4b0a1e0 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -28,7 +28,15 @@ finally: pynvml.nvmlShutdown() except Exception: - pass + # CUDA is supported on Jetson, but NVML may not be. 
+ import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): + is_cuda = True is_rocm = False diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 70724b8be4c45..0d07050fd1b6a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,7 +4,7 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, List, TypeVar import pynvml import torch @@ -38,10 +38,23 @@ # see https://github.com/huggingface/diffusers/issues/9704 for details torch.backends.cuda.enable_cudnn_sdp(False) -# NVML utils -# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, -# all the related functions work on real physical device ids. -# the major benefit of using NVML is that it will not initialize CUDA + +def device_id_to_physical_device_id(device_id: int) -> int: + if "CUDA_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + if device_ids == [""]: + msg = ( + "CUDA_VISIBLE_DEVICES is set to empty string, which means" + " GPU support is disabled. If you are using ray, please unset" + " the environment variable `CUDA_VISIBLE_DEVICES` inside the" + " worker/actor. " + "Check https://github.com/vllm-project/vllm/issues/8402 for" + " more information.") + raise RuntimeError(msg) + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]: @@ -57,87 +70,75 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: return wrapper -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return pynvml.nvmlDeviceGetCudaComputeCapability(handle) - - -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_name(device_id: int = 0) -> str: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return pynvml.nvmlDeviceGetName(handle) - - -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_total_memory(device_id: int = 0) -> int: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) - +class CudaPlatformBase(Platform): + _enum = PlatformEnum.CUDA + device_type: str = "cuda" + dispatch_key: str = "CUDA" -@with_nvml_context -def warn_if_different_devices(): - device_ids: int = pynvml.nvmlDeviceGetCount() - if device_ids > 1: - device_names = [get_physical_device_name(i) for i in range(device_ids)] - if len(set(device_names)) > 1 and os.environ.get( - "CUDA_DEVICE_ORDER") != "PCI_BUS_ID": - logger.warning( - "Detected different devices in the system: \n%s\nPlease" - " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " - "avoid unexpected behavior.", "\n".join(device_names)) + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + raise NotImplementedError + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + raise NotImplementedError -try: - from sphinx.ext.autodoc.mock import _MockModule + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + raise NotImplementedError - if not isinstance(pynvml, _MockModule): - warn_if_different_devices() -except ModuleNotFoundError: - warn_if_different_devices() + @classmethod + def is_full_nvlink(cls, device_ids: List[int]) -> 
bool: + raise NotImplementedError + @classmethod + def log_warnings(cls): + pass -def device_id_to_physical_device_id(device_id: int) -> int: - if "CUDA_VISIBLE_DEVICES" in os.environ: - device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") - if device_ids == [""]: - msg = ( - "CUDA_VISIBLE_DEVICES is set to empty string, which means" - " GPU support is disabled. If you are using ray, please unset" - " the environment variable `CUDA_VISIBLE_DEVICES` inside the" - " worker/actor. " - "Check https://github.com/vllm-project/vllm/issues/8402 for" - " more information.") - raise RuntimeError(msg) - physical_device_id = device_ids[device_id] - return int(physical_device_id) - else: - return device_id + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" -class CudaPlatform(Platform): - _enum = PlatformEnum.CUDA - device_type: str = "cuda" - dispatch_key: str = "CUDA" +# NVML utils +# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, +# all the related functions work on real physical device ids. +# the major benefit of using NVML is that it will not initialize CUDA +class NvmlCudaPlatform(CudaPlatformBase): @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: physical_device_id = device_id_to_physical_device_id(device_id) - major, minor = get_physical_device_capability(physical_device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) return DeviceCapability(major=major, minor=minor) @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_name(cls, device_id: int = 0) -> str: physical_device_id = device_id_to_physical_device_id(device_id) - return get_physical_device_name(physical_device_id) + return cls._get_physical_device_name(physical_device_id) @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_total_memory(cls, device_id: int = 0) -> int: physical_device_id = device_id_to_physical_device_id(device_id) - return get_physical_device_total_memory(physical_device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) @classmethod @with_nvml_context @@ -153,27 +154,86 @@ def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: if i < j: try: p2p_status = pynvml.nvmlDeviceGetP2PStatus( - handle, peer_handle, - pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + handle, + peer_handle, + pynvml.NVML_P2P_CAPS_INDEX_NVLINK, + ) if p2p_status != pynvml.NVML_P2P_STATUS_OK: return False except pynvml.NVMLError: logger.exception( - "NVLink detection failed. This is normal if your" - " machine has no NVLink equipped.") + "NVLink detection failed. 
This is normal if" + " your machine has no NVLink equipped.") return False return True @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config - if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + def _get_physical_device_name(cls, device_id: int = 0) -> str: + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + return pynvml.nvmlDeviceGetName(handle) + + @classmethod + @with_nvml_context + def log_warnings(cls): + device_ids: int = pynvml.nvmlDeviceGetCount() + if device_ids > 1: + device_names = [ + cls._get_physical_device_name(i) for i in range(device_ids) + ] + if (len(set(device_names)) > 1 + and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"): + logger.warning( + "Detected different devices in the system: \n%s\nPlease" + " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " + "avoid unexpected behavior.", + "\n".join(device_names), + ) + + +class NonNvmlCudaPlatform(CudaPlatformBase): + + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + major, minor = torch.cuda.get_device_capability(device_id) + return DeviceCapability(major=major, minor=minor) + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return torch.cuda.get_device_name(device_id) + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory + + @classmethod + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + logger.exception( + "NVLink detection not possible, as context support was" + " not found. Assuming no NVLink available.") + return False + + +# Autodetect either NVML-enabled or non-NVML platform +# based on whether NVML is available. +nvml_available = False +try: + try: + pynvml.nvmlInit() + nvml_available = True + except Exception: + # On Jetson, NVML is not supported. 
+ nvml_available = False +finally: + if nvml_available: + pynvml.nvmlShutdown() + +CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform + +try: + from sphinx.ext.autodoc.mock import _MockModule + + if not isinstance(pynvml, _MockModule): + CudaPlatform.log_warnings() +except ModuleNotFoundError: + CudaPlatform.log_warnings() From 1c3692f3cf2032a33a51ca579cd69ecde67ef175 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 26 Nov 2024 13:44:01 -0500 Subject: [PATCH 020/293] [Bugfix] Fix using `-O[0,3]` with LLM entrypoint (#10677) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 5 ++++- vllm/entrypoints/llm.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 60ad5ee54a2f2..90b4798f17a13 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -206,7 +206,10 @@ def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object - if isinstance(self.compilation_config, (int, dict)): + if isinstance(self.compilation_config, (int)): + self.compilation_config = CompilationConfig.from_cli( + str(self.compilation_config)) + elif isinstance(self.compilation_config, (dict)): self.compilation_config = CompilationConfig.from_cli( json.dumps(self.compilation_config)) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e07f4c04abd84..1551a9a998160 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -185,8 +185,14 @@ def __init__( kwargs["disable_log_stats"] = True if compilation_config is not None: - compilation_config_instance = CompilationConfig.from_cli( - json.dumps(compilation_config)) + if isinstance(compilation_config, (int)): + compilation_config_instance = CompilationConfig.from_cli( + str(compilation_config)) + elif isinstance(compilation_config, (dict)): + compilation_config_instance = CompilationConfig.from_cli( + json.dumps(compilation_config)) + else: + compilation_config_instance = compilation_config else: compilation_config_instance = None From d89ca25d5e5f45725009628e0ea86545804e0831 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 14:12:50 -0500 Subject: [PATCH 021/293] small change Signed-off-by: Andrew Feldman --- vllm/v1/engine/llm_engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b93634230529e..402a1c5dc85ad 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -160,11 +160,8 @@ def step(self) -> List[RequestOutput]: return request_outputs def get_model_config(self): - """Gets the model configuration.""" return self.model_config - # TODO(rob): Can we get rid of these? 
- def start_profile(self): self.engine_core.profile(True) From 47a71ecc087553f8d352bcb08602a767f2ce26c2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 15:00:26 -0500 Subject: [PATCH 022/293] partially re-enabled detokenize cases in test Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index a303438c8a3d9..01be27926ef84 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -57,7 +57,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (None, 0), (0, None), (0, 0), - (None, 7), + (None, 6), (0, 5), ] elif batch_logprobs_composition == "SAMPLE_PROMPT": @@ -67,7 +67,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (0, 0), (5, None), (3, 0), - (7, 3), + (6, 3), (None, 6), (0, 5), ] @@ -243,7 +243,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) @pytest.mark.parametrize("batch_logprobs_composition", ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) @@ -252,7 +252,7 @@ def test_get_logprobs_and_prompt_logprobs( vllm_runner, model: str, dtype: str, - # detokenize: bool, + detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, @@ -279,6 +279,7 @@ def test_get_logprobs_and_prompt_logprobs( dtype detokenize: if False, return generated tokens bypassing detokenizer batch_logprobs_composition: logprobs configuration for test batch + max_num_batched_tokens: token budget for scheduling example_prompts monkeypatch """ @@ -301,8 +302,7 @@ def test_get_logprobs_and_prompt_logprobs( ["half"]) # needed for comparing logprobs with HF # @pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", - ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) def test_fast_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, From 028256e9fe81af8b3ea844767f8b7522d58f444a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 15:13:57 -0500 Subject: [PATCH 023/293] deferring support for detokenization feature to subsequent SamplingParams work Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 01be27926ef84..7c736d957e38a 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -243,7 +243,6 @@ def _test_case_get_logprobs_and_prompt_logprobs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) @pytest.mark.parametrize("batch_logprobs_composition", ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) @@ -252,7 +251,6 @@ def test_get_logprobs_and_prompt_logprobs( vllm_runner, model: str, dtype: str, - detokenize: 
bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, @@ -277,20 +275,17 @@ def test_get_logprobs_and_prompt_logprobs( vllm_runner model dtype - detokenize: if False, return generated tokens bypassing detokenizer batch_logprobs_composition: logprobs configuration for test batch max_num_batched_tokens: token budget for scheduling example_prompts monkeypatch """ - detokenize = True - _test_case_get_logprobs_and_prompt_logprobs( hf_runner=hf_runner, vllm_runner=vllm_runner, model=model, dtype=dtype, - detokenize=detokenize, + detokenize=True, batch_logprobs_composition=batch_logprobs_composition, max_num_batched_tokens=max_num_batched_tokens, example_prompts=example_prompts, @@ -300,15 +295,14 @@ def test_get_logprobs_and_prompt_logprobs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) def test_fast_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, model: str, dtype: str, - # detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, @@ -319,14 +313,13 @@ def test_fast_get_logprobs_and_prompt_logprobs( Faster version of `test_get_logprobs_and_prompt_logprobs` with fewer test cases. """ - detokenize = True _test_case_get_logprobs_and_prompt_logprobs( hf_runner=hf_runner, vllm_runner=vllm_runner, model=model, dtype=dtype, - detokenize=detokenize, + detokenize=True, batch_logprobs_composition=batch_logprobs_composition, max_num_batched_tokens=max_num_batched_tokens, example_prompts=example_prompts, @@ -356,15 +349,12 @@ def test_max_logprobs(monkeypatch): @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, - monkeypatch): +def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` Args: vllm_runner model - detokenize: whether to feed generated tokens to detokenizer example_prompts monkeypatch """ @@ -385,8 +375,7 @@ def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, logprobs=None, prompt_logprobs=None, - temperature=0.0, - detokenize=detokenize) + temperature=0.0) results_logprobs_none = vllm_model.model.generate( example_prompts, sampling_params=sampling_params_logprobs_none) From 9190b5a58879561d5d1138f18f6ac6b9cc1628d8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 29 Nov 2024 02:45:14 +0000 Subject: [PATCH 024/293] tweak tolerance; fast check Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 9 +++++++++ tests/v1/samplers/test_logprobs.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fc23c9cff0d87..c6d31b837c55d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -174,6 +174,15 @@ steps: commands: - VLLM_USE_V1=1 pytest -v -s v1 +- label: V1 Fast Test + #mirror_hardwares: [amd] + fast_check: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - VLLM_USE_V1=1 pytest -v -s 
v1/samplers/test_logprobs.py::test_fast_get_logprobs_and_prompt_logprobs + - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 7c736d957e38a..a42e78da85ca0 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -234,8 +234,8 @@ def _test_case_get_logprobs_and_prompt_logprobs( torch.testing.assert_close( logprob.logprob, hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) + atol=2e-2, + rtol=2e-2) else: assert vllm_result.prompt_logprobs is None From fda0fcb75494dd7677c92c057204cdcfcfe615e6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 00:38:36 +0000 Subject: [PATCH 025/293] removed fast tests from pipeline --- .buildkite/test-pipeline.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 02a80640ac3f8..46692506f01d4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -174,15 +174,6 @@ steps: commands: - VLLM_USE_V1=1 pytest -v -s v1 -- label: V1 Fast Test - #mirror_hardwares: [amd] - fast_check: true - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - VLLM_USE_V1=1 pytest -v -s v1/samplers/test_logprobs.py::test_fast_get_logprobs_and_prompt_logprobs - - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] From 1fa0b711c943bbcfdc3003dbfb6c293820617919 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Mon, 2 Dec 2024 04:38:52 -0500 Subject: [PATCH 026/293] Update vllm/outputs.py Co-authored-by: Woosuk Kwon --- vllm/outputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index ead37164f1113..08bc5a91174a9 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -141,7 +141,7 @@ def new( token_ids: completion token ids logprobs: completion sample logprobs prompt_logprobs: prompt logprobs - finished + finished: whether the request is finished """ # TODO: Support `n` > 1. 
From bc1c004bfde9131859997c72990554c8dc17fc1f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 10:12:01 +0000 Subject: [PATCH 027/293] small fixes --- vllm/v1/engine/processor.py | 26 +++++++++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 3 +-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5bcf1b5e7b86e..8fe9d3adb8792 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -39,7 +39,7 @@ def __init__( self.input_processor = input_registry.create_input_processor( model_config) - def _assert_valid_logprobs_prompt_logprobs( + def _assert_valid_sample_logprobs_prompt_logprobs( self, params: Union[SamplingParams, PoolingParams], max_logprobs: int, @@ -70,17 +70,37 @@ def process_inputs( prompt: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: float, - max_logprobs: int, + max_logprobs_permitted_by_engine: int, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + """Process the input prompt into an engine request + + Args: + request_id: request ID + prompt: input prompt str + params: sampling or pooling commands + arrival_time: time when inputs arrived; will be computed if `None` + is passed in + max_logprobs_permitted_by_engine: the max number of sample or prompt + logprobs a request may ask for + lora_request: LoRA request structure + trace_headers: trace info + prompt_adapter_request: prompt adapter request structure + priority: currently unsupported; must be zero & is by default. + + Returns: + Detokenizer request structure + Engine request structure + """ # TODO(woosuk): Support embedding mode. # TODO(woosuk): Support encoder-decoder models. 
- self._assert_valid_logprobs_prompt_logprobs(params, max_logprobs) + self._assert_valid_sample_logprobs_prompt_logprobs( + params, max_logprobs_permitted_by_engine) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dbfb6ef3aaa4..6004d160c5c09 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -211,8 +211,7 @@ def _prepare_inputs( self, scheduler_output: "SchedulerOutput", sampling_metadata: SamplingMetadata, - ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, - torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + ) -> Tuple[torch.Tensor, FlashAttentionMetadata]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 From bec886b23b2569f926418ab267950300b82a4274 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 10:34:19 +0000 Subject: [PATCH 028/293] moved output processing commands into processor Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 250 ----------------------------------- vllm/v1/engine/core.py | 255 +++++++++++++++++++++++++++++++++++- vllm/v1/engine/processor.py | 2 +- 3 files changed, 253 insertions(+), 254 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b515d15172c44..899bdcbb156bb 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -6,11 +6,8 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.engine import EngineCoreOutput -from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus if TYPE_CHECKING: @@ -390,253 +387,6 @@ def _try_schedule_encoder_inputs( encoder_inputs_to_schedule.append(i) return encoder_inputs_to_schedule, num_new_tokens, encoder_budget - def _pythonize_logprobs( - self, - do_logprobs: bool, - do_prompt_logprobs: bool, - model_runner_output: "ModelRunnerOutput", - ) -> Tuple[List, List, List, List]: - """Convert logprobs tensors to Python data structures. 
- - Args: - do_logprobs: sample logprobs are required - do_prompt_logprobs: prompt logprobs are required - model_runner_output: model runner output contains CPU logprobs tensors - - Returns: - logprob_token_ids_list - logprob_values_list - prompt_logprob_token_ids_list - prompt_logprob_values_list - """ - if do_logprobs: - # Pythonize sample logprobs if needed - assert model_runner_output.logprob_token_ids_cpu is not None - logprob_token_ids_list = ( - model_runner_output.logprob_token_ids_cpu.tolist()) - logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) - else: - ( - logprob_token_ids_list, - logprob_values_list, - ) = (None, None) - if do_prompt_logprobs: - # Pythonize prompt logprobs if needed - assert model_runner_output.prompt_logprob_token_ids_cpu is not None - prompt_logprob_token_ids_list = ( - model_runner_output.prompt_logprob_token_ids_cpu.tolist()) - prompt_logprob_values_list = ( - model_runner_output.prompt_logprobs_cpu.tolist()) - else: - ( - prompt_logprob_token_ids_list, - prompt_logprob_values_list, - ) = (None, None) - - return (logprob_token_ids_list, logprob_values_list, - prompt_logprob_token_ids_list, prompt_logprob_values_list) - - def update_from_output( - self, - scheduler_output: "SchedulerOutput", - model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: - # NOTE(woosuk): This method doesn't consider speculative decoding. - sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() - num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_logprobs = model_runner_output.logprobs_cpu is not None - do_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu is not None - and len(model_runner_output.prompt_logprobs_cpu) > 0) - - # Get logprobs as Python data structures - ( - logprob_token_ids_list, - logprob_values_list, - prompt_logprob_token_ids_list, - prompt_logprob_values_list, - ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, - model_runner_output) - - if do_prompt_logprobs: - # Index into prompt tokens, for building - # prompt logprobs output data structure - curr_prompt_base_idx = 0 - new_running: List[Request] = [] - engine_core_outputs: List[EngineCoreOutput] = [] - for request in self.running: - req_id = request.request_id - request.num_computed_tokens += num_scheduled_tokens[req_id] - req_index = model_runner_output.req_id_to_index[req_id] - num_new_tokens = 1 - max_logprobs = request.max_logprobs - request_do_logprobs = (do_logprobs and max_logprobs is not None - and max_logprobs > 0) - - if do_prompt_logprobs: - max_prompt_logprobs = request.max_prompt_logprobs - # Number of new prompt tokens is the number of scheduled - # tokens *if* the request is partial (because the sampled - # token is discarded and all sequence offsets are prompt - # offsets), otherwise it is the number of scheduled - # tokens minus one (for the sampled token) - num_new_prompt_tokens = ( - num_scheduled_tokens[request.request_id] - - int(scheduler_output.partial_req_index != req_index)) - - request_do_prompt_logprobs = (max_prompt_logprobs is not None - and max_prompt_logprobs > 0 - and num_new_prompt_tokens > 0) - - if request_do_prompt_logprobs: - - # Construct prompt logprobs, under the condition that - # prompt logprobs were requested & a nonzero number of - # prompt tokens were computed in this step for this request. - # - # Note that this scenario returns an EngineCoreOutput which - # is empty except for the prompt logprobs which were - # computed for these prompt tokens. 
- - slice_upper_index = (curr_prompt_base_idx + - num_new_prompt_tokens) - prompt_logprob_token_ids = prompt_logprob_token_ids_list[ - curr_prompt_base_idx:slice_upper_index] - prompt_logprob_values = prompt_logprob_values_list[ - curr_prompt_base_idx:slice_upper_index] - curr_prompt_base_idx = slice_upper_index - - logprob_cnt = max_prompt_logprobs - prompt_logprobs = [{ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(plp_tok_values[0:logprob_cnt], - plp_tok_token_ids[0:logprob_cnt])) - } for plp_tok_values, plp_tok_token_ids in zip( - prompt_logprob_values, prompt_logprob_token_ids)] - - if not request.prompt_logprobs: - # Ensure that None is the first prompt logprob - prompt_logprobs = [None] + prompt_logprobs - - curr_prompt_base_idx = slice_upper_index - - prompt_slice_range_upper = request.num_computed_tokens - prompt_slice_range_lower = (prompt_slice_range_upper - - num_new_prompt_tokens) - request.prompt_logprobs.extend(prompt_logprobs) - else: - curr_prompt_base_idx += num_new_prompt_tokens - else: - request_do_prompt_logprobs = False - - # When the request's num_computed_tokens catches up its num_tokens, - # the request generates output tokens. Otherwise, we ignore the - # sampler output for the request. - assert request.num_computed_tokens <= request.num_tokens - - cached_encoder_input_ids = ( - self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) - - if request.num_computed_tokens == request.num_tokens: - # NOTE(woosuk): Currently, we assume that each request - # generates at most one token at each step. - token_id = sampled_token_ids[req_index] - if request_do_logprobs: - # Construct logprobs, if requested (TODO: assumes one - # generated token). - logprob_token_ids = logprob_token_ids_list[req_index] - logprob_values = logprob_values_list[req_index] - logprob_cnt = max_logprobs - if token_id not in logprob_token_ids[0:max_logprobs]: - # Sampled token is not in the in the top logprobs; - # inject it & resort, ensuring that excess logprobs - # not requested by the user have -inf probability - logprob_values[max_logprobs:-1] = ( - [float('-inf')] * - (len(logprob_values) - 1 - max_logprobs)) - - indices = sorted(range(len(logprob_values)), - key=lambda k: logprob_values[k], - reverse=True) - logprob_values = [logprob_values[i] for i in indices] - logprob_token_ids = [ - logprob_token_ids[i] for i in indices - ] - - # There will be one more logprob than the user requested - logprob_cnt = max_logprobs + 1 - - # Only keep the number of logprobs specified by the request - # (plus possibly the sampled token id & its logprob) - logprob_values = logprob_values[0:logprob_cnt] - logprob_token_ids = logprob_token_ids[0:logprob_cnt] - - request.logprobs.append({ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(logprob_values, logprob_token_ids)) - }) - request.append_output_token_ids(token_id) - # TODO: Update the KV cache manager for prefix caching. - - # Check for stop and update request state. - # This must be called before me make the EngineCoreOutput. - stopped = self._check_stop(request) - - # Add EngineCoreOutput for this Request. 
- # Return the logprob for the most recently computed tokens. - # Return no prompt logprobs in decode-phase. - output = EngineCoreOutput( - request_id=req_id, - new_token_ids=request.output_token_ids[-num_new_tokens:], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=(request.logprobs[-num_new_tokens:] - if request_do_logprobs else None), - prompt_logprobs=(prompt_logprobs - if request_do_prompt_logprobs else None), - prompt_logprobs_token_ids=(request.prompt_token_ids - if request_do_prompt_logprobs - else None)) - engine_core_outputs.append(output) - - # Breakout of the loop. - if stopped: - continue - - elif request_do_prompt_logprobs: - # This request is still partial but prompt logprobs were - # requested - engine_core_outputs.append( - EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=[] if request_do_logprobs else None, - prompt_logprobs=( - prompt_logprobs if request_do_prompt_logprobs else - ([] if request_do_prompt_logprobs else None)), - prompt_logprobs_token_ids=( - request.prompt_token_ids[prompt_slice_range_lower: - prompt_slice_range_upper] - if request_do_prompt_logprobs else - ([] if request_do_prompt_logprobs else None)))) - - new_running.append(request) - self.running = new_running - return engine_core_outputs - def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len or request.num_output_tokens >= request.max_tokens): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 34f99dd30ef2e..c6ff0bc59da5f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -14,13 +14,15 @@ from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger +from vllm.sequence import Logprob from vllm.usage.usage_lib import UsageContext -from vllm.v1.core.scheduler import Scheduler +from vllm.v1.core.scheduler import Scheduler, SchedulerOutput from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType) from vllm.v1.engine.mm_input_mapper import MMInputMapper from vllm.v1.executor.gpu_executor import GPUExecutor +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder from vllm.version import __version__ as VLLM_VERSION @@ -103,6 +105,254 @@ def abort_requests(self, request_ids: List[str]): self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) + def _pythonize_logprobs( + self, + do_logprobs: bool, + do_prompt_logprobs: bool, + model_runner_output: "ModelRunnerOutput", + ) -> Tuple[List, List, List, List]: + """Convert logprobs tensors to Python data structures. 
+ + Args: + do_logprobs: sample logprobs are required + do_prompt_logprobs: prompt logprobs are required + model_runner_output: model runner output contains CPU logprobs tensors + + Returns: + logprob_token_ids_list + logprob_values_list + prompt_logprob_token_ids_list + prompt_logprob_values_list + """ + if do_logprobs: + # Pythonize sample logprobs if needed + assert model_runner_output.logprob_token_ids_cpu is not None + logprob_token_ids_list = ( + model_runner_output.logprob_token_ids_cpu.tolist()) + logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + else: + ( + logprob_token_ids_list, + logprob_values_list, + ) = (None, None) + if do_prompt_logprobs: + # Pythonize prompt logprobs if needed + assert model_runner_output.prompt_logprob_token_ids_cpu is not None + prompt_logprob_token_ids_list = ( + model_runner_output.prompt_logprob_token_ids_cpu.tolist()) + prompt_logprob_values_list = ( + model_runner_output.prompt_logprobs_cpu.tolist()) + else: + ( + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = (None, None) + + return (logprob_token_ids_list, logprob_values_list, + prompt_logprob_token_ids_list, prompt_logprob_values_list) + + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[EngineCoreOutput]: + scheduler = self.scheduler + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + + # Get logprobs as Python data structures + ( + logprob_token_ids_list, + logprob_values_list, + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, + model_runner_output) + + if do_prompt_logprobs: + # Index into prompt tokens, for building + # prompt logprobs output data structure + curr_prompt_base_idx = 0 + new_running: List[Request] = [] + engine_core_outputs: List[EngineCoreOutput] = [] + for request in scheduler.running: + req_id = request.request_id + request.num_computed_tokens += num_scheduled_tokens[req_id] + req_index = model_runner_output.req_id_to_index[req_id] + num_new_tokens = 1 + max_logprobs = request.max_logprobs + request_do_logprobs = (do_logprobs and max_logprobs is not None + and max_logprobs > 0) + + if do_prompt_logprobs: + max_prompt_logprobs = request.max_prompt_logprobs + # Number of new prompt tokens is the number of scheduled + # tokens *if* the request is partial (because the sampled + # token is discarded and all sequence offsets are prompt + # offsets), otherwise it is the number of scheduled + # tokens minus one (for the sampled token) + num_new_prompt_tokens = ( + num_scheduled_tokens[request.request_id] - + int(scheduler_output.partial_req_index != req_index)) + + request_do_prompt_logprobs = (max_prompt_logprobs is not None + and max_prompt_logprobs > 0 + and num_new_prompt_tokens > 0) + + if request_do_prompt_logprobs: + + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. 
+ # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. + + slice_upper_index = (curr_prompt_base_idx + + num_new_prompt_tokens) + prompt_logprob_token_ids = prompt_logprob_token_ids_list[ + curr_prompt_base_idx:slice_upper_index] + prompt_logprob_values = prompt_logprob_values_list[ + curr_prompt_base_idx:slice_upper_index] + curr_prompt_base_idx = slice_upper_index + + logprob_cnt = max_prompt_logprobs + prompt_logprobs = [{ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(plp_tok_values[0:logprob_cnt], + plp_tok_token_ids[0:logprob_cnt])) + } for plp_tok_values, plp_tok_token_ids in zip( + prompt_logprob_values, prompt_logprob_token_ids)] + + if not request.prompt_logprobs: + # Ensure that None is the first prompt logprob + prompt_logprobs = [None] + prompt_logprobs + + curr_prompt_base_idx = slice_upper_index + + prompt_slice_range_upper = request.num_computed_tokens + prompt_slice_range_lower = (prompt_slice_range_upper - + num_new_prompt_tokens) + request.prompt_logprobs.extend(prompt_logprobs) + else: + curr_prompt_base_idx += num_new_prompt_tokens + else: + request_do_prompt_logprobs = False + + # When the request's num_computed_tokens catches up its num_tokens, + # the request generates output tokens. Otherwise, we ignore the + # sampler output for the request. + assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + scheduler.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + scheduler.encoder_cache_manager.free(request, input_id) + + if request.num_computed_tokens == request.num_tokens: + # NOTE(woosuk): Currently, we assume that each request + # generates at most one token at each step. + token_id = sampled_token_ids[req_index] + if request_do_logprobs: + # Construct logprobs, if requested (TODO: assumes one + # generated token). + logprob_token_ids = logprob_token_ids_list[req_index] + logprob_values = logprob_values_list[req_index] + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:max_logprobs]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = ( + [float('-inf')] * + (len(logprob_values) - 1 - max_logprobs)) + + indices = sorted(range(len(logprob_values)), + key=lambda k: logprob_values[k], + reverse=True) + logprob_values = [logprob_values[i] for i in indices] + logprob_token_ids = [ + logprob_token_ids[i] for i in indices + ] + + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_values = logprob_values[0:logprob_cnt] + logprob_token_ids = logprob_token_ids[0:logprob_cnt] + + request.logprobs.append({ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(logprob_values, logprob_token_ids)) + }) + request.append_output_token_ids(token_id) + # TODO: Update the KV cache manager for prefix caching. + + # Check for stop and update request state. 
+ # This must be called before me make the EngineCoreOutput. + stopped = scheduler._check_stop(request) + + # Add EngineCoreOutput for this Request. + # Return the logprob for the most recently computed tokens. + # Return no prompt logprobs in decode-phase. + output = EngineCoreOutput( + request_id=req_id, + new_token_ids=request.output_token_ids[-num_new_tokens:], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=(request.logprobs[-num_new_tokens:] + if request_do_logprobs else None), + prompt_logprobs=(prompt_logprobs + if request_do_prompt_logprobs else None), + prompt_logprobs_token_ids=(request.prompt_token_ids + if request_do_prompt_logprobs + else None)) + engine_core_outputs.append(output) + + # Breakout of the loop. + if stopped: + continue + + elif request_do_prompt_logprobs: + # This request is still partial but prompt logprobs were + # requested + engine_core_outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=[] if request_do_logprobs else None, + prompt_logprobs=( + prompt_logprobs if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)), + prompt_logprobs_token_ids=( + request.prompt_token_ids[prompt_slice_range_lower: + prompt_slice_range_upper] + if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)))) + + new_running.append(request) + scheduler.running = new_running + return engine_core_outputs + def step(self) -> List[EngineCoreOutput]: """Schedule, execute, and make output.""" @@ -111,8 +361,7 @@ def step(self) -> List[EngineCoreOutput]: scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, output) + engine_core_outputs = self.update_from_output(scheduler_output, output) return engine_core_outputs def profile(self, is_start=True): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8fe9d3adb8792..37b16051da9fb 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -76,7 +76,7 @@ def process_inputs( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: - """Process the input prompt into an engine request + """Process the input prompt into engine (& possibly tokenizer) requests Args: request_id: request ID From 554f43111a09efd03c66b3a55d0ff6a9a338654b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 11:03:16 +0000 Subject: [PATCH 029/293] added explanatory comment to EngineCore.update_from_output() --- vllm/v1/engine/core.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c6ff0bc59da5f..2611d08efe0dc 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -156,6 +156,15 @@ def update_from_output( scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", ) -> List[EngineCoreOutput]: + """Build engine core output from model runner output. + + Args: + scheduler_output: scheduler output prior to engine step. + model_runner_output: model runner output from engine step. + + Returns: + Engine core output which tracks the progress of generation. 
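+            At most one output is emitted per running request in a step; a
+            request that is still in its prefill phase is included only when
+            it has new prompt logprobs to return.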
+ """ scheduler = self.scheduler # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() From 5dea1d58b7f810be10351603d537f9b0a1c4e5c2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Nov 2024 01:27:30 -0800 Subject: [PATCH 030/293] [misc] move functions to config.py (#10624) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- tests/compile/piecewise/test_simple.py | 4 +- tests/compile/piecewise/test_toy_llama.py | 4 +- tests/kernels/test_encoder_decoder_attn.py | 3 +- .../model_executor/test_enabled_custom_ops.py | 3 +- vllm/attention/layer.py | 3 +- vllm/compilation/wrapper.py | 3 +- vllm/config.py | 51 +++++++++++++++++ vllm/model_executor/custom_op.py | 2 +- vllm/model_executor/model_loader/loader.py | 3 +- .../model_executor/model_loader/tensorizer.py | 3 +- vllm/plugins/__init__.py | 56 ------------------- 11 files changed, 62 insertions(+), 73 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 0db12d6b6a43c..7ef502abee345 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -10,8 +10,8 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig -from vllm.plugins import set_current_vllm_config +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) from vllm.utils import direct_register_custom_op global_counter = 0 diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index cfe661b8871e0..dbd5a3bbffeab 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -16,8 +16,8 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig -from vllm.plugins import set_current_vllm_config +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) from vllm.utils import direct_register_custom_op # create a library to hold the custom op diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index c4b72ba6bf4ee..d943b048b7934 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -18,10 +18,9 @@ from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) -from vllm.config import VllmConfig +from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import set_forward_context from vllm.platforms import current_platform -from vllm.plugins import set_current_vllm_config # List of support backends for encoder/decoder models LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index c54e30995da49..0a3aba255fd76 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -2,13 +2,12 @@ import 
pytest -from vllm.config import CompilationConfig, VllmConfig +from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.plugins import set_current_vllm_config # Registered subclass for test diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 1bb335909484b..17157617248f7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -7,13 +7,12 @@ import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend -from vllm.config import CacheConfig +from vllm.config import CacheConfig, get_current_vllm_config from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform -from vllm.plugins import get_current_vllm_config from vllm.utils import direct_register_custom_op diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 0143d0301ca1a..bc4d292fef402 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -8,7 +8,7 @@ import torch import vllm.envs as envs -from vllm.config import CompilationLevel +from vllm.config import CompilationLevel, get_current_vllm_config class TorchCompileWrapperWithCustomDispatcher: @@ -32,7 +32,6 @@ def __init__(self, # default compilation settings # compiling the forward method - from vllm.plugins import get_current_vllm_config backend = get_current_vllm_config( ).compilation_config.init_backend() diff --git a/vllm/config.py b/vllm/config.py index 68720f3a3034d..0a390c4311ba6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,6 +3,7 @@ import hashlib import json import warnings +from contextlib import contextmanager from dataclasses import dataclass, field, replace from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, @@ -2450,3 +2451,53 @@ def __str__(self): self.cache_config.enable_prefix_caching, self.model_config.use_async_output_proc, self.model_config.mm_processor_kwargs) + + +_current_vllm_config: Optional[VllmConfig] = None + + +@contextmanager +def set_current_vllm_config(vllm_config: VllmConfig): + """ + Temporarily set the current VLLM config. + Used during model initialization. + We save the current VLLM config in a global variable, + so that all modules can access it, e.g. custom ops + can access the VLLM config to determine how to dispatch. + """ + global _current_vllm_config + old_vllm_config = _current_vllm_config + from vllm.compilation.counter import compilation_counter + num_models_seen = compilation_counter.num_models_seen + try: + _current_vllm_config = vllm_config + yield + finally: + logger.debug("enabled custom ops: %s", + vllm_config.compilation_config.enabled_custom_ops) + logger.debug("disabled custom ops: %s", + vllm_config.compilation_config.disabled_custom_ops) + if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + and compilation_counter.num_models_seen == num_models_seen: + # If the model supports compilation, + # compilation_counter.num_models_seen should be increased + # by at least 1. 
+ # If it is not increased, it means the model does not support + # compilation (does not have @support_torch_compile decorator). + logger.warning( + "`torch.compile` is turned on, but the model %s" + " does not support it. Please open an issue on GitHub" + "if you want it to be supported.", + vllm_config.model_config.model) + _current_vllm_config = old_vllm_config + + +def get_current_vllm_config() -> VllmConfig: + if _current_vllm_config is None: + # in ci, usually when we test custom ops/modules directly, + # we don't set the vllm config. In that case, we set a default + # config. + logger.warning("Current VLLM config is not set.") + from vllm.config import VllmConfig + return VllmConfig() + return _current_vllm_config diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index b07966f2ab7d0..fddc8bad09ef5 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -2,9 +2,9 @@ import torch.nn as nn +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.plugins import get_current_vllm_config from vllm.utils import print_warning_once logger = init_logger(__name__) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 34e0860162260..441dd409b4f9d 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -23,7 +23,7 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm.config import (LoadConfig, LoadFormat, ModelConfig, ParallelConfig, - VllmConfig) + VllmConfig, set_current_vllm_config) from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE @@ -47,7 +47,6 @@ safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.plugins import set_current_vllm_config from vllm.utils import is_pin_memory_available diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 3fd668765a1b1..87f3fcb5cae00 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -13,13 +13,12 @@ from transformers import PretrainedConfig import vllm.envs as envs -from vllm.config import ModelConfig, ParallelConfig +from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.plugins import set_current_vllm_config from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 8b43167693598..3c64726ca3344 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,15 +1,10 @@ import logging import os -from contextlib import contextmanager -from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs -if TYPE_CHECKING: - from vllm.config import VllmConfig - logger = logging.getLogger(__name__) # make sure one process only loads plugins once @@ -64,54 +59,3 @@ def load_general_plugins(): logger.info("plugin %s loaded.", plugin.name) except Exception: logger.exception("Failed to load plugin %s", plugin.name) - - -_current_vllm_config: 
Optional["VllmConfig"] = None - - -@contextmanager -def set_current_vllm_config(vllm_config: "VllmConfig"): - """ - Temporarily set the current VLLM config. - Used during model initialization. - We save the current VLLM config in a global variable, - so that all modules can access it, e.g. custom ops - can access the VLLM config to determine how to dispatch. - """ - global _current_vllm_config - old_vllm_config = _current_vllm_config - from vllm.compilation.counter import compilation_counter - from vllm.config import CompilationLevel - num_models_seen = compilation_counter.num_models_seen - try: - _current_vllm_config = vllm_config - yield - finally: - logger.debug("enabled custom ops: %s", - vllm_config.compilation_config.enabled_custom_ops) - logger.debug("disabled custom ops: %s", - vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ - and compilation_counter.num_models_seen == num_models_seen: - # If the model supports compilation, - # compilation_counter.num_models_seen should be increased - # by at least 1. - # If it is not increased, it means the model does not support - # compilation (does not have @support_torch_compile decorator). - logger.warning( - "`torch.compile` is turned on, but the model %s" - " does not support it. Please open an issue on GitHub" - "if you want it to be supported.", - vllm_config.model_config.model) - _current_vllm_config = old_vllm_config - - -def get_current_vllm_config() -> "VllmConfig": - if _current_vllm_config is None: - # in ci, usually when we test custom ops/modules directly, - # we don't set the vllm config. In that case, we set a default - # config. - logger.warning("Current VLLM config is not set.") - from vllm.config import VllmConfig - return VllmConfig() - return _current_vllm_config From 930f2cc2c6a7d78211ad2152c27bfc62acd1b697 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 25 Nov 2024 17:51:20 +0800 Subject: [PATCH 031/293] [Model] Support `is_causal` HF config field for Qwen2 model (#10621) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 13 +++++++++--- .../embedding/language/test_embedding.py | 12 +++++++++-- tests/models/embedding/utils.py | 4 ++-- vllm/config.py | 15 ++++++++++---- vllm/model_executor/models/qwen2.py | 20 +++++++++++++++++-- 5 files changed, 51 insertions(+), 13 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ccd2d8de8ec0b..54e2c4479c2c9 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -342,7 +342,7 @@ Text Embedding - ✅︎ * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. + - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - ✅︎ - ✅︎ * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` @@ -363,6 +363,13 @@ Text Embedding .. tip:: You can override the model's pooling method by passing :code:`--override-pooler-config`. +.. note:: + Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. + You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. + + On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention + despite being described otherwise on its model card. 
+ Reward Modeling --------------- @@ -606,10 +613,10 @@ Text Generation | :sup:`+` Multiple items can be inputted per text prompt for this modality. .. note:: - vLLM currently only supports adding LoRA to the language backbone of multimodal models. + vLLM currently only supports adding LoRA to the language backbone of multimodal models. .. note:: - For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. + The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 Multimodal Embedding diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index c3f351ef707be..36b1e5887981c 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -21,6 +21,7 @@ marks=[pytest.mark.core_model]), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), ], ) @pytest.mark.parametrize("dtype", ["half"]) @@ -31,6 +32,10 @@ def test_models( model, dtype: str, ) -> None: + vllm_extra_kwargs = {} + if model == "Alibaba-NLP/gte-Qwen2-7B-instruct": + vllm_extra_kwargs["hf_overrides"] = {"is_causal": False} + # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: @@ -43,8 +48,11 @@ def test_models( is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, task="embedding", dtype=dtype, - max_model_len=None) as vllm_model: + with vllm_runner(model, + task="embedding", + dtype=dtype, + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) # This test is for verifying whether the model's extra_repr # can be printed correctly. 
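A minimal offline-inference sketch of the override exercised by the test above.
This is illustrative rather than part of the patch: the model name and the
is_causal override come from the docs change in this commit, the task= and
hf_overrides= keyword arguments mirror what vllm_runner forwards to vllm.LLM,
and the prompt string is made up.

    from vllm import LLM

    # gte-Qwen2-7B-instruct needs bidirectional attention, so flip the HF
    # config's is_causal flag when loading it as an embedding model.
    llm = LLM(model="Alibaba-NLP/gte-Qwen2-7B-instruct",
              task="embedding",
              hf_overrides={"is_causal": False})

    # Each returned item carries the pooled embedding for one prompt.
    outputs = llm.encode(["Example sentence to embed."])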
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index fd1c44d9c117e..f96c7d2b176db 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -24,7 +24,7 @@ def check_embeddings_close( dim=0) fail_msg = (f"Test{prompt_idx}:" - f"\n{name_0}:\t{embeddings_0!r}" - f"\n{name_1}:\t{embeddings_1!r}") + f"\n{name_0}:\t{embeddings_0[:16]!r}" + f"\n{name_1}:\t{embeddings_1[:16]!r}") assert sim >= 1 - tol, fail_msg diff --git a/vllm/config.py b/vllm/config.py index 0a390c4311ba6..f9ecb02cd5bde 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,7 +27,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - identity, print_warning_once, resolve_obj_by_qualname) + print_warning_once, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -183,7 +183,7 @@ def __init__( hf_overrides_fn = hf_overrides else: hf_overrides_kw = hf_overrides - hf_overrides_fn = identity + hf_overrides_fn = None if rope_scaling is not None: hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} @@ -212,8 +212,15 @@ def __init__( self.skip_tokenizer_init = skip_tokenizer_init hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, config_format, **hf_overrides_kw) - hf_config = hf_overrides_fn(hf_config) + code_revision, config_format) + + if hf_overrides_kw: + logger.info("Overriding HF config with %s", hf_overrides_kw) + hf_config.update(hf_overrides_kw) + if hf_overrides_fn: + logger.info("Overriding HF config with %s", hf_overrides_fn) + hf_config = hf_overrides_fn(hf_config) + self.hf_config = hf_config self.hf_text_config = get_hf_text_config(self.hf_config) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 370cff5fa153f..8da75c9935a13 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -27,7 +27,7 @@ from torch import nn from transformers import Qwen2Config -from vllm.attention import Attention, AttentionMetadata +from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -164,11 +164,17 @@ def forward( hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + attn_output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=attn_type) output, _ = self.o_proj(attn_output) return output @@ -210,6 +216,15 @@ def __init__( self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + # By default, Qwen2 uses causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. 
Alibaba-NLP/gte-Qwen2-7B-instruct) + if getattr(config, "is_causal", True): + self._attn_type = AttentionType.DECODER + else: + self._attn_type = AttentionType.ENCODER_ONLY + def forward( self, positions: torch.Tensor, @@ -230,6 +245,7 @@ def forward( hidden_states=hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata, + attn_type=self._attn_type, ) # Fully Connected From 060ca2fecae2abf0fce0fa4085344420dfa0f9aa Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Mon, 25 Nov 2024 21:08:30 +0800 Subject: [PATCH 032/293] Super tiny little typo fix (#10633) Signed-off-by: Andrew Feldman --- docs/source/quantization/fp8_e5m2_kvcache.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst index 9ae07bcd3b991..b2d824427f786 100644 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ b/docs/source/quantization/fp8_e5m2_kvcache.rst @@ -4,7 +4,7 @@ FP8 E5M2 KV Cache ================== The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. +The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. Here is an example of how to enable this feature: From 084199bf6e0251f2466413e2da2fe78d06bf1b18 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 26 Nov 2024 00:21:41 +0800 Subject: [PATCH 033/293] [Bug]: Authorization ignored when root_path is set (#10606) Signed-off-by: chaunceyjiang Signed-off-by: Andrew Feldman --- tests/entrypoints/openai/test_root_path.py | 103 +++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 6 +- 2 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 tests/entrypoints/openai/test_root_path.py diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py new file mode 100644 index 0000000000000..20f7960619efb --- /dev/null +++ b/tests/entrypoints/openai/test_root_path.py @@ -0,0 +1,103 @@ +import contextlib +import os +from typing import Any, List, NamedTuple + +import openai # use the official client for correctness check +import pytest + +from ...utils import RemoteOpenAIServer + +# # any model with a chat template should work here +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +API_KEY = "abc-123" +ERROR_API_KEY = "abc" +ROOT_PATH = "llm" + + +@pytest.fixture(scope="module") +def server(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--enforce-eager", + "--max-model-len", + "4080", + "--root-path", # use --root-path=/llm for testing + "/" + ROOT_PATH, + "--chat-template", + DUMMY_CHAT_TEMPLATE, + ] + envs = os.environ.copy() + + envs["VLLM_API_KEY"] = API_KEY + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +class TestCase(NamedTuple): + model_name: str + base_url: List[str] + api_key: str + expected_error: Any + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "test_case", + [ + TestCase( + model_name=MODEL_NAME, + base_url=["v1"], # http://localhost:8000/v1 + api_key=ERROR_API_KEY, + expected_error=openai.AuthenticationError), + TestCase( + model_name=MODEL_NAME, + base_url=[ROOT_PATH, 
"v1"], # http://localhost:8000/llm/v1 + api_key=ERROR_API_KEY, + expected_error=openai.AuthenticationError), + TestCase( + model_name=MODEL_NAME, + base_url=["v1"], # http://localhost:8000/v1 + api_key=API_KEY, + expected_error=None), + TestCase( + model_name=MODEL_NAME, + base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 + api_key=API_KEY, + expected_error=None), + ], +) +async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer, + test_case: TestCase): + saying: str = "Here is a common saying about apple. An apple a day, keeps" + ctx = contextlib.nullcontext() + if test_case.expected_error is not None: + ctx = pytest.raises(test_case.expected_error) + with ctx: + client = openai.AsyncOpenAI( + api_key=test_case.api_key, + base_url=server.url_for(*test_case.base_url), + max_retries=0) + chat_completion = await client.chat.completions.create( + model=test_case.model_name, + messages=[{ + "role": "user", + "content": "tell me a common saying" + }, { + "role": "assistant", + "content": saying + }], + extra_body={ + "continue_final_message": True, + "add_generation_prompt": False + }) + + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "stop" + message = choice.message + assert len(message.content) > 0 + assert message.role == "assistant" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2b1f14b89b1f2..bc018be982bff 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -499,10 +499,12 @@ async def validation_exception_handler(_, exc): @app.middleware("http") async def authentication(request: Request, call_next): - root_path = "" if args.root_path is None else args.root_path if request.method == "OPTIONS": return await call_next(request) - if not request.url.path.startswith(f"{root_path}/v1"): + url_path = request.url.path + if app.root_path and url_path.startswith(app.root_path): + url_path = url_path[len(app.root_path):] + if not url_path.startswith("/v1"): return await call_next(request) if request.headers.get("Authorization") != "Bearer " + token: return JSONResponse(content={"error": "Unauthorized"}, From ad02c99ed9a55936f6b40d936039ad7723fe7ae9 Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: Mon, 25 Nov 2024 14:23:32 -0300 Subject: [PATCH 034/293] [Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850) Signed-off-by: Wallas Santos Co-authored-by: Michael Goin Signed-off-by: Andrew Feldman --- pyproject.toml | 1 + tests/conftest.py | 19 +++++++++ tests/kernels/test_prefix_prefill.py | 63 ++++++++++++++++++++++++++++ vllm/attention/ops/prefix_prefill.py | 41 ++++++++++++------ vllm/config.py | 10 +++++ vllm/engine/arg_utils.py | 1 + 6 files changed, 122 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3c8c46cc8621e..253b706a774a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,4 +98,5 @@ markers = [ "quant_model: run this model test under Quantized category", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", "skip_v1: do not run this test with v1", + "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/tests/conftest.py b/tests/conftest.py index 29707f975e2a0..d56942d8912af 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1030,3 +1030,22 @@ def dummy_gemma2_embedding_path(): with open(json_path, "w") as f: 
json.dump(config, f) return _dummy_gemma2_embedding_path + + +# Add the flag `--optional` to allow run tests +# that are marked with @pytest.mark.optional +def pytest_addoption(parser): + parser.addoption("--optional", + action="store_true", + default=False, + help="run optional test") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--optional"): + # --optional given in cli: do not skip optional tests + return + skip_optional = pytest.mark.skip(reason="need --optional option to run") + for item in items: + if "optional" in item.keywords: + item.add_marker(skip_optional) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index a8a187ebaede4..3fdb7996ba4e0 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -40,6 +40,13 @@ def test_contexted_kv_attention( kv_cache_dtype: str, device: str, ) -> None: + + if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( + 89): + pytest.skip( + 'Triton limitation: fp8e4nv data type is not supported on CUDA' + ' arch < 89') + current_platform.seed_everything(0) torch.set_default_device(device) @@ -235,6 +242,13 @@ def test_contexted_kv_attention_alibi( kv_cache_dtype: str, device: str, ) -> None: + + if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( + 89): + pytest.skip( + 'Triton limitation: fp8e4nv data type is not supported on CUDA' + ' arch < 89') + current_platform.seed_everything(0) torch.set_default_device(device) @@ -462,3 +476,52 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6 torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) + + +# These tests are optional to only run when explicitly invoked +# +# pytest -v -s --optional \ +# tests/kernels/test_prefix_prefill.py::test_contexted_kv_attention_f32 +# +# These tests are useful to test model dtype float32 on Turing devices. 
+# We skip them to not increase the time when running tests on CI +@pytest.mark.optional +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW) +@torch.inference_mode() +def test_contexted_kv_attention_f32( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + sliding_window: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, +) -> None: + test_contexted_kv_attention(num_heads, num_queries_per_kv, head_size, + sliding_window, dtype, kv_cache_dtype, device) + + +@pytest.mark.optional +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_contexted_kv_attention_alibi_f32( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, +) -> None: + test_contexted_kv_attention_alibi(num_heads, num_queries_per_kv, head_size, + dtype, kv_cache_dtype, device) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index a2a649c8ebcfd..9c11a8df55278 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -7,6 +7,13 @@ from vllm.platforms import current_platform +# Static kernels parameters +BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64 +NUM_WARPS = 8 + +# To check compatibility +IS_TURING = current_platform.get_device_capability() == (7, 5) + if triton.__version__ >= "2.1.0": @triton.jit @@ -50,6 +57,7 @@ def _fwd_kernel( stride_v_cache_d, stride_v_cache_bl, num_queries_per_kv: int, + IN_PRECISION: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, # head size BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 @@ -130,7 +138,7 @@ def _fwd_kernel( k = k_load qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) # [M,N] - qk += tl.dot(q, k) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")) qk *= sm_scale @@ -178,7 +186,7 @@ def _fwd_kernel( v = v_load p = p.to(v.dtype) - acc += tl.dot(p, v) + acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) # # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -204,7 +212,7 @@ def _fwd_kernel( other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) qk *= sm_scale # apply causal mask qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, @@ -238,7 +246,7 @@ def _fwd_kernel( other=0.0) p = p.to(v.dtype) - acc += tl.dot(p, v) + acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -485,6 +493,7 @@ def _fwd_kernel_alibi( stride_v_cache_d, stride_v_cache_bl, num_queries_per_kv: int, + IN_PRECISION: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, # head size BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 @@ -560,7 +569,7 @@ def 
_fwd_kernel_alibi( k = k_load qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")) qk *= sm_scale @@ -600,7 +609,7 @@ def _fwd_kernel_alibi( v = v_load p = p.to(v.dtype) - acc += tl.dot(p, v, allow_tf32=False) + acc = tl.dot(p, v, acc=acc, input_precision='ieee') # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -635,7 +644,7 @@ def _fwd_kernel_alibi( other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k, allow_tf32=False) + qk = tl.dot(q, k, acc=qk, input_precision='ieee') qk *= sm_scale qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) @@ -673,7 +682,7 @@ def _fwd_kernel_alibi( other=0.0) p = p.to(v.dtype) - acc += tl.dot(p, v, allow_tf32=False) + acc = tl.dot(p, v, acc=acc, input_precision='ieee') # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -709,13 +718,17 @@ def context_attention_fwd(q, alibi_slopes=None, sliding_window=None): - BLOCK = 128 if current_platform.has_device_capability(80) else 64 - NUM_WARPS = 8 - + q_dtype_is_f32 = q.dtype is torch.float32 # need to reduce num. blocks when using fp32 # due to increased use of GPU shared memory - if q.dtype is torch.float32: - BLOCK = BLOCK // 2 + # if q.dtype is torch.float32: + BLOCK = BASE_BLOCK // 2 if q_dtype_is_f32 else BASE_BLOCK + + # Turing does have tensor core for float32 multiplication + # use ieee as fallback for triton kernels work. There is also + # warning on vllm/config.py to inform users this fallback + # implementation + IN_PRECISION = 'ieee' if IS_TURING and q_dtype_is_f32 else None # Conversion of FP8 Tensor from uint8 storage to # appropriate torch.dtype for interpretation by Triton @@ -799,6 +812,7 @@ def context_attention_fwd(q, v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_DMODEL_PADDED=Lk_padded, @@ -850,6 +864,7 @@ def context_attention_fwd(q, v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_DMODEL_PADDED=Lk_padded, diff --git a/vllm/config.py b/vllm/config.py index f9ecb02cd5bde..c87feaec3e5f6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2388,6 +2388,16 @@ def __post_init__(self): self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + if self.scheduler_config is not None and \ + self.model_config is not None and \ + self.scheduler_config.chunked_prefill_enabled and \ + self.model_config.dtype == torch.float32 and \ + current_platform.get_device_capability() == (7, 5): + print_warning_once( + "Turing devices tensor cores do not support float32 matmul. 
" + "To workaround this limitation, vLLM will set 'ieee' input " + "precision for chunked prefill triton kernels.") + if self.compilation_config is None: self.compilation_config = CompilationConfig() if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a43e133f21ac2..ca68c1d57151c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1055,6 +1055,7 @@ def create_engine_config(self) -> VllmConfig: msg = "Chunked prefill is not supported for embedding models" raise ValueError(msg) + speculative_config = SpeculativeConfig.maybe_create_spec_config( target_model_config=model_config, target_parallel_config=parallel_config, From c76bf01690c079d659d5ec818ff1bece93b2ea30 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 25 Nov 2024 09:34:46 -0800 Subject: [PATCH 035/293] [Docs] Add Snowflake Slides (#10641) Signed-off-by: simon-mo Signed-off-by: Andrew Feldman --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e1353d98f1dc..cfeb24cbb5823 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 -- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing). +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). 
From 5e36a52c5dcbf27b482a94ecf3f34f217990e2dc Mon Sep 17 00:00:00 2001 From: zhou fan <1247714429@qq.com> Date: Tue, 26 Nov 2024 02:10:55 +0800 Subject: [PATCH 036/293] [Model]: Add support for Aria model (#10514) Signed-off-by: xffxff <1247714429@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 6 + examples/offline_inference_vision_language.py | 18 + ...e_inference_vision_language_multi_image.py | 20 + tests/models/registry.py | 2 + vllm/entrypoints/chat_utils.py | 2 + vllm/model_executor/models/aria.py | 695 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/configs/aria.py | 47 ++ 8 files changed, 791 insertions(+) create mode 100644 vllm/model_executor/models/aria.py create mode 100644 vllm/transformers_utils/configs/aria.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 54e2c4479c2c9..7a6932d65e653 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -476,6 +476,12 @@ Text Generation - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`AriaForConditionalGeneration` + - Aria + - T + I + - :code:`rhymes-ai/Aria` + - + - ✅︎ * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - T + I\ :sup:`E` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 11af6880e1b5a..f08f22eec164a 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -402,6 +402,23 @@ def run_idefics3(question: str, modality: str): return llm, prompt, stop_token_ids +# Aria +def run_aria(question: str, modality: str): + assert modality == "image" + model_name = "rhymes-ai/Aria" + + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16") + + prompt = (f"<|im_start|>user\n<|img|>\n{question}" + "<|im_end|>\n<|im_start|>assistant\n") + + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -423,6 +440,7 @@ def run_idefics3(question: str, modality: str): "molmo": run_molmo, "glm4v": run_glm4v, "idefics3": run_idefics3, + "aria": run_aria, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index dc12df8d78211..788b604cfd4a0 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -321,6 +321,25 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: ) +def load_aria(question, image_urls: List[str]) -> ModelRequestData: + model_name = "rhymes-ai/Aria" + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + limit_mm_per_prompt={"image": len(image_urls)}) + placeholders = "<|img|>\n" * len(image_urls) + prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None) + + model_example_map = { "phi3_v": load_phi3v, "h2ovl_chat": load_h2onvl, @@ -330,6 +349,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: "qwen_vl_chat": 
load_qwenvl_chat, "mllama": load_mllama, "idefics3": load_idefics3, + "aria": load_aria, } diff --git a/tests/models/registry.py b/tests/models/registry.py index fa0818c4f0bd1..669c832b1df3a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -43,6 +43,8 @@ class _HfExamplesInfo: trust_remote_code=True), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", + trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index abee5ac46391c..c2054dcbfce0e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -412,6 +412,8 @@ def _placeholder_str(self, modality: ModalityStr, return "" if model_type == "idefics3": return "" + if model_type == "aria": + return "<|fim_prefix|><|img|><|fim_suffix|>" raise TypeError(f"Unknown {modality} model type: {model_type}") elif modality == "audio": diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py new file mode 100644 index 0000000000000..0356435e9c257 --- /dev/null +++ b/vllm/model_executor/models/aria.py @@ -0,0 +1,695 @@ +import math +from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from transformers import LlamaConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, QuantizationConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.inputs import INPUT_REGISTRY, token_inputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + get_compressed_tensors_cache_scale) +from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, + SamplingMetadata) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.idefics2_vision_model import ( + Idefics2VisionTransformer) +from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, + LlamaModel) +from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, + is_pp_missing_parameter, + make_layers, maybe_prefix, + merge_multimodal_embeddings) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.utils import (cached_get_tokenizer, + repeat_and_pad_placeholder_tokens) +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, + AriaVisionConfig) + +from .utils import flatten_bn + + +class AriaImagePixelInputs(TypedDict): + pixel_values: torch.Tensor + pixel_mask: Optional[torch.Tensor] + """ + Shape: + pixel_values: `(batch_size 
* num_images, num_channels, height, width)` + pixel_mask: `(batch_size * num_images, height, width)` + """ + + +class AriaVisionTransformer(Idefics2VisionTransformer): + """ + AriaVisionTransformer is a modified version of Idefics2VisionTransformer + that replaces the post-layernorm with an identity layer. + """ + + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, quant_config, prefix) + self.post_layernorm = nn.Identity() + + +class AriaVisionModel(nn.Module): + config_class = AriaVisionConfig + + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + self.vision_model = AriaVisionTransformer( + config, + quant_config, + prefix=f"{prefix}.vision_model", + ) + + def forward( + self, + pixel_values: torch.Tensor, + pixel_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + vit_oup = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + image_atts = self._create_image_attention_mask(patch_attention_mask) + + return vit_oup, image_atts + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def _create_image_attention_mask(self, patch_attention_mask): + if patch_attention_mask is None: + return None + + flattened_mask = patch_attention_mask.flatten(1) + return torch.logical_not(flattened_mask) + + +class FFN(nn.Module): + + def __init__(self, embed_dim, ff_dim, output_dim): + super().__init__() + self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) + self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) + self.act = get_act_fn("gelu_new") + + def forward(self, hidden_states): + hidden_states, _ = self.linear_in(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.linear_out(hidden_states) + return hidden_states + + +class CrossAttention(nn.Module): + + def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + super().__init__() + self.num_heads = num_heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) + + self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + self.linear = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_out_rate) + + self.layer_norm = nn.LayerNorm(embed_dim) + self.ln_kv = nn.LayerNorm(kv_dim) + + def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + normed_hidden_states = self.layer_norm(hidden_states) + query = self.q_proj(normed_hidden_states).permute(1, 0, 2) + + x = self.ln_kv(x) + key = self.k_proj(x).permute(1, 0, 2) + value = self.v_proj(x).permute(1, 0, 2) + + attn_output, _ = self.multihead_attn(query, + key, + value, + attn_mask=attn_mask) + + attn_output = attn_output.permute(1, 0, 2) + + if add_residual: + attn_output = hidden_states + self.dropout( + self.linear(attn_output)) + 
else: + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + A projection module with one cross attention layer and one FFN layer, which + projects ViT's outputs into MoE's inputs. + + Args: + patch_to_query_dict (dict): Maps patch numbers to their corresponding + query numbers, + e.g., {1225: 128, 4900: 256}. This allows for different query sizes + based on image resolution. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + kv_dim (int): Dimension of key and value. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. + + Outputs: + A tensor with the shape of (batch_size, query_number, output_dim) + """ + + def __init__( + self, + patch_to_query_dict, + embed_dim, + num_heads, + kv_dim, + ff_dim, + output_dim, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.patch_to_query_dict = patch_to_query_dict + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter( + torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)) + + trunc_normal_(self.query, std=0.02) + + self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + + self.ln_ffn = norm_layer(embed_dim) + self.ffn = FFN(embed_dim, ff_dim, output_dim) + + def forward(self, x, attn_mask=None): + bs = x.shape[0] + queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + + query_num = self.patch_to_query_dict.get(x.shape[1], None) + assert (query_num is not None + ), f"Query number for {x.shape[1]} patches is not provided" + + queries = queries[:, :query_num, :] + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) + + out = self.ffn(self.ln_ffn(attention_out)) + + return out + + +class AriaFusedMoE(FusedMoE): + + def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, + shard_id: str) -> Set[str]: + # Override the weight_loader to handle the expert weights in the Aria + # model, which are already packed with experts, and merge the gate and + # up weights for each expert. + # Note: Loading expert weights with quantization is not supported + tp_rank = get_tensor_model_parallel_rank() + if shard_id == 'w13': + # the shape of loaded_weight is + # (num_experts, hidden_size, 2 * moe_intermediate_size) + if self.tp_size > 1: + up, gate = loaded_weight.chunk(2, dim=-1) + up_current_rank = up.chunk(self.tp_size, dim=-1)[tp_rank] + gate_current_rank = gate.chunk(self.tp_size, dim=-1)[tp_rank] + up_and_gate = torch.cat([up_current_rank, gate_current_rank], + dim=-1).transpose(1, 2) + param.data.copy_(up_and_gate) + else: + param.data.copy_(loaded_weight.transpose(1, 2)) + elif shard_id == 'w2': + # the shape of loaded_weight is + # (num_experts, moe_intermediate_size, hidden_size) + if self.tp_size > 1: + down_current_rank = loaded_weight.chunk(self.tp_size, + dim=1)[tp_rank] + param.data.copy_(down_current_rank.transpose(1, 2)) + else: + param.data.copy_(loaded_weight.transpose(1, 2)) + + +class MoELayer(nn.Module): + """ + Mixture of Experts (MoE) Layer for the AriaMoE model. + + This layer implements the MoE mechanism, which routes input tokens to + different experts based on a routing algorithm, processes them through the + experts, and then combines the outputs. 
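+
+    The layer's output is the sum of a dense shared-expert MLP applied to all
+    tokens and the sparse top-k expert mixture selected by the router (see
+    ``forward`` below).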
+ """ + + def __init__( + self, + config: AriaMoELMConfig, + quant_config: Optional[QuantizationConfig], + ) -> None: + super().__init__() + self.config = config + + self.router_weight = nn.Parameter( + torch.empty( + (self.config.moe_num_experts, self.config.hidden_size))) + + self.experts = AriaFusedMoE( + num_experts=config.moe_num_experts, + top_k=config.moe_topk, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + quant_config=quant_config, + reduce_results=True, + ) + self.shared_experts = LlamaMLP( + config.hidden_size, + config.moe_intermediate_size * config.moe_num_shared_experts, + "silu", + quant_config=quant_config, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (torch.Tensor): Input tensor of shape (batch_size, + sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + """ + + router_output = torch.nn.functional.linear(hidden_states, + self.router_weight) + + shared_expert_output = self.shared_experts(hidden_states) + sparse_expert_output = self.experts(hidden_states, router_output) + + return sparse_expert_output + shared_expert_output + + +class MoEDecoderLayer(LlamaDecoderLayer): + """ + Custom Decoder Layer for the AriaMoE model which modifies the standard + `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of + Experts (MoE) Layer. + """ + + def __init__( + self, + config: LlamaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, cache_config, quant_config, prefix) + self.mlp = MoELayer(config, quant_config=quant_config) + + +class AriaMoELMModel(LlamaModel): + """ + Custom LlamaModel for the AriaMoE model which modifies the standard + LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + # FIXME: this is a hack to disable the compilation of the model + self.do_not_compile = True + + self.layers = None + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MoEDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + + # Adapted from LlamaModel.load_weights with the modification of adding + # the expert weights mapping to `stacked_params_mapping` + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ("experts.w13_weight", "experts.fc1.weight", 'w13'), + ("experts.w2_weight", "experts.fc2.weight", 'w2'), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. 
+ continue + if scale_name := get_compressed_tensors_cache_scale(name): + # Loading kv cache scales for compressed-tensors quantization + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = loaded_weight[0] + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +def build_mm_projector(config): + return AriaProjector( + patch_to_query_dict=config.projector_patch_to_query_dict, + embed_dim=config.vision_config.hidden_size, + num_heads=config.vision_config.num_attention_heads, + kv_dim=config.vision_config.hidden_size, + ff_dim=config.text_config.hidden_size, + output_dim=config.text_config.hidden_size, + ) + + +def get_max_multimodal_tokens(ctx): + return max(ctx.model_config.hf_config.image_size2tokens.values()) + + +def input_mapper_for_aria(ctx, data): + return MultiModalInputs(data) + + +def input_processor(ctx, llm_inputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + # if it is pure text input, use it as is + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + hf_config = model_config.hf_config + + # prepare image tokens, the max_image_size is used to determine the number + # of patch_size for every image + max_image_size = multi_modal_data.pop("max_image_size", 980) + _split_image = multi_modal_data.pop("split_image", False) + + assert isinstance(max_image_size, + (int, float)), "max_image_size should be float or int" + images = (multi_modal_data["image"] if isinstance( + multi_modal_data["image"], list) else [multi_modal_data["image"]]) + + image_inputs = image_processor.preprocess(images, + max_image_size=max_image_size, + split_image=_split_image, + return_tensors="pt").data + image_inputs['pixel_values'] = image_inputs['pixel_values'].to( + ctx.model_config.dtype) + num_crops = image_inputs.pop("num_crops") + + prompt_token_ids = llm_inputs["prompt_token_ids"] + if num_crops.sum().item() > 0: + _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( + tokenizer, + None, + prompt_token_ids, + placeholder_token_id=hf_config.image_token_index, + repeat_count=num_crops, + ) + + repeat_count = [hf_config.image_size2tokens[max_image_size] + ] * sum(num_crops).item() + new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( + tokenizer, + None, + prompt_token_ids, + 
placeholder_token_id=hf_config.image_token_index, + repeat_count=repeat_count, + ) + + return token_inputs( + prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data={"image": image_inputs}, + ) + + +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) +@INPUT_REGISTRY.register_input_processor(input_processor) +class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): + """ + Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language + model to perform tasks that involve both image and text inputs. + """ + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + # prepare the image_size to tokens mapping for the image preprocess, see + # input_processor + config.image_size2tokens = { + int(math.sqrt(k) * config.vision_config.patch_size): v + for k, v in config.projector_patch_to_query_dict.items() + } + self.config = config + self.vision_tower = AriaVisionModel(config.vision_config) + self.multi_modal_projector = build_mm_projector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AriaMoELMModel( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "language_model.model"), + ) + self.pad_token_id = (self.config.pad_token_id + if self.config.pad_token_id is not None else -1) + self.unpadded_vocab_size = config.text_config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.text_config.hidden_size, + org_num_embeddings=self.language_model.org_vocab_size, + quant_config=quant_config, + ) + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + self.vocab_size, logit_scale) + self.sampler = Sampler() + + def _validate_image_sizes( + self, images: List[torch.Tensor]) -> List[torch.Tensor]: + if not all(img.shape == images[0].shape for img in images): + raise ValueError("All images must be the same size") + return images + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[AriaImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + pixel_mask = kwargs.pop("pixel_mask", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + pixel_values = self._validate_image_sizes(pixel_values) + pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + pixel_mask = flatten_bn(pixel_mask, concat=True) + + return AriaImagePixelInputs( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + ) + + def _process_image_input( + self, image_input: AriaImagePixelInputs + ) -> Tuple[torch.Tensor, torch.Tensor]: + assert self.vision_tower is not None + + pixel_values = image_input['pixel_values'] + pixel_mask = image_input['pixel_mask'] + + image_feature, image_attn_mask = self.vision_tower( + pixel_values, pixel_mask=pixel_mask) + return self.multi_modal_projector(image_feature, image_attn_mask) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + multimodal_embeddings = self._process_image_input(image_input) + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_index) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None + + hidden_states = self.language_model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "language_model.model": "language_model", + "language_model.lm_head": "lm_head", + }, + orig_to_new_suffix={ + "router.weight": "router_weight", + }, + ) + + loader = AutoWeightsLoader(self) + loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 789ffb4d3bde0..184f4b2bc1526 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -133,6 +133,7 @@ _MULTIMODAL_MODELS = { # [Decoder-only] + "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"), "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py new file mode 100644 index 0000000000000..d253da0d96a34 --- /dev/null +++ b/vllm/transformers_utils/configs/aria.py @@ -0,0 +1,47 @@ +from transformers.models.idefics2.configuration_idefics2 import ( + Idefics2VisionConfig) +from transformers.models.llama.configuration_llama import LlamaConfig + + +class AriaVisionConfig(Idefics2VisionConfig): + model_type = "aria_vision_model" + + +class AriaMoELMConfig(LlamaConfig): + """ + Configuration class for AriaMoE language model. + + This class extends the LlamaConfig to include additional parameters specific + to the Mixture of Experts (MoE) architecture. + """ + + model_type = "aria_moe_lm" + + def __init__( + self, + moe_intermediate_size: int = 4096, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + **kwargs, + ): + """ + Initialize the AriaMoELMConfig. + + Args: + moe_intermediate_size (int): The intermediate size for MoE layers. + Default is 4096. + moe_num_experts (int): The number of experts in the MoE layer. + Default is 8. + moe_topk (int): The number of top experts to route to for each + token. Default is 2. + moe_num_shared_experts (int): The number of shared experts. Default + is 2. + **kwargs: Additional keyword arguments to be passed to the parent + LlamaConfig. + """ + super().__init__(**kwargs) + self.moe_intermediate_size = moe_intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts From 80a1dd498e5ab468c6cdaa30bc95559da1fecbde Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 02:14:33 +0800 Subject: [PATCH 037/293] [Model] Enable optional prefix when loading embedding models (#10639) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- vllm/model_executor/models/bert.py | 9 +++++---- vllm/model_executor/models/gemma2.py | 4 +++- vllm/model_executor/models/llama.py | 5 ++++- vllm/model_executor/models/qwen2.py | 12 ++++++------ vllm/model_executor/models/roberta.py | 3 ++- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index f570d6d3c12b3..1fff72b3490e9 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -14,18 +14,17 @@ RowParallelLinear) from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, PoolingType) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) -from .utils import maybe_prefix +from .interfaces import SupportsCrossEncoding +from .utils import WeightsMapper, maybe_prefix class BertEmbedding(nn.Module): @@ -442,6 +441,8 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights 
= hf_to_vllm_mapper.apply(weights) self.model.load_weights(weights) def _build_model(self, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index fd8223dd9be1b..d229eb74669ee 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -42,7 +42,7 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, extract_layer_index, +from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -511,4 +511,6 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) self.model.load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 66b29e72cfa89..33d78d74129c8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -53,7 +53,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -689,6 +690,8 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) self.model.load_weights(weights) def load_kv_cache_scales(self, quantization_param_path: str) -> None: diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 8da75c9935a13..46640226d4cf8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -50,7 +50,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -585,8 +586,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - loader = AutoWeightsLoader(self, - ignore_unexpected_prefixes=["lm_head."]) - return loader.load_weights(weights) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) + self.model.load_weights(weights) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 5a296e311f079..ba1a78ac640fd 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -11,13 +11,14 @@ VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel -from vllm.model_executor.models.interfaces import SupportsCrossEncoding from 
vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) +from .interfaces import SupportsCrossEncoding + class RobertaEmbedding(nn.Module): From 84e74aaa6077bb36fda2a5dadc01629ba6f00df7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 02:15:45 +0800 Subject: [PATCH 038/293] [Doc] Fix typos in docs (#10636) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 2 +- docs/source/serving/compatibility_matrix.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 7a6932d65e653..3f012284bfbff 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -365,7 +365,7 @@ Text Embedding .. note:: Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. - You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. + You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index a4300761d2635..fa03d2cde1486 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -393,7 +393,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - ✗ + - ? * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✅ - ✅ From 0b34acf8f746d48d897ddb12baab8dcd9153c7e4 Mon Sep 17 00:00:00 2001 From: Shane A Date: Mon, 25 Nov 2024 14:26:40 -0800 Subject: [PATCH 039/293] [Model] Add OLMo November 2024 model (#10503) Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 5 + tests/distributed/test_pipeline_parallel.py | 1 + tests/models/registry.py | 1 + vllm/model_executor/models/olmo2.py | 432 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 5 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/olmo2.py | 166 ++++++++ 8 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 vllm/model_executor/models/olmo2.py create mode 100644 vllm/transformers_utils/configs/olmo2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3f012284bfbff..b5cbe6915d581 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -234,6 +234,11 @@ Text Generation - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. - - ✅︎ + * - :code:`OLMo2ForCausalLM` + - OLMo2 + - :code:`allenai/OLMo2-7B-1124`, etc. + - + - ✅︎ * - :code:`OLMoEForCausalLM` - OLMoE - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. 
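As a quick reference for the newly listed architecture, here is a minimal offline-inference sketch (illustrative only, not part of this patch; it assumes the `shanearora/OLMo-7B-1124-hf` checkpoint exercised by the tests below, and the prompt and sampling settings are arbitrary):

```python
from vllm import LLM, SamplingParams

# OLMo2 checkpoint referenced by the registry and pipeline-parallel tests in this patch.
llm = LLM(model="shanearora/OLMo-7B-1124-hf")
params = SamplingParams(temperature=0.0, max_tokens=16)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)
```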
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index c49ed9802cde8..386877e0e0a2c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -167,6 +167,7 @@ def iter_params(self, model_name: str): "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), + "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(), "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), diff --git a/tests/models/registry.py b/tests/models/registry.py index 669c832b1df3a..865e90b3f8b0e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -93,6 +93,7 @@ class _HfExamplesInfo: "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), + "Olmo2ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"), "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py new file mode 100644 index 0000000000000..a35c911f90d96 --- /dev/null +++ b/vllm/model_executor/models/olmo2.py @@ -0,0 +1,432 @@ +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py +# Copyright 2024 The vLLM team. +# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only OLMo2 model compatible with HuggingFace weights.""" + +from functools import partial +from typing import Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed.communication_op import tensor_model_parallel_all_gather +from vllm.distributed.parallel_state import get_tensor_model_parallel_rank +from vllm.distributed.utils import split_tensor_along_last_dim +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, + make_layers, maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.olmo2 import Olmo2Config + + +class Olmo2Attention(nn.Module): + """ + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + assert isinstance(self.config, Olmo2Config) + + hidden_size = self.config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = self.config.num_attention_heads + + assert hidden_size % self.total_num_heads == 0 + assert self.total_num_heads % self.tp_size == 0 + + self.num_heads = self.total_num_heads // self.tp_size + self.total_num_kv_heads = (self.config.num_key_value_heads + or self.total_num_heads) + if self.total_num_kv_heads >= self.tp_size: + assert self.total_num_kv_heads % self.tp_size == 0 + else: + assert self.tp_size % self.total_num_kv_heads == 0 + + self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.max_position_embeddings = self.config.max_position_embeddings + self.rope_theta = self.config.rope_theta + + # Attention input projection. Projects x -> (q, k, v) + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.tp_rank = get_tensor_model_parallel_rank() + self.k_norm = RMSNorm( + self.total_num_kv_heads * self.head_dim, + eps=self.config.rms_norm_eps, + ) + self.q_norm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + + # Rotary embeddings. 
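+        # Note: RoPE is applied *after* the q/k RMSNorm above (see forward /
+        # _apply_qk_norm). Because q_norm and k_norm normalize over the full,
+        # un-sharded head dimensions, _apply_qk_norm all-gathers q and k
+        # across tensor-parallel ranks before normalizing.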
+ self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, # type: ignore + ) + self.scaling = self.head_dim**-0.5 + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=vllm_config.cache_config, + quant_config=vllm_config.quant_config, + prefix=prefix, + ) + + # Attention output projection. + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.o_proj", + ) + + def _apply_qk_norm(self, q: torch.Tensor, + k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + if self.tp_size > 1: + q = tensor_model_parallel_all_gather(q.contiguous()) + k = tensor_model_parallel_all_gather(k.contiguous()) + q = self.q_norm.forward_native(q) + k = self.k_norm.forward_native(k) + if self.tp_size > 1: + splitter = partial(split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + return q, k + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self._apply_qk_norm(q, k) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Olmo2MLP(nn.Module): + """ + This is the MLP block where the output is computed as + ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))`` + (plus another skip connection). + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + assert isinstance(config, Olmo2Config) + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + + # Feed-forward input projection. + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + + # Activation function. + self.act_fn = SiluAndMul() + + # Feed-forward output projection. + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.down_proj", + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Olmo2DecoderLayer(nn.Module): + """ + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + assert isinstance(config, Olmo2Config) + # Attention block. + self.self_attn = Olmo2Attention(vllm_config=vllm_config, + prefix=f"{prefix}.self_attn") + + # MLP block. 
+ self.mlp = Olmo2MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp") + + # LayerNorm + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + self.post_feedforward_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + # Attention block. + residual = hidden_states + hidden_states = self.self_attn(positions, hidden_states, kv_cache, + attn_metadata) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = hidden_states + residual + + # MLP block. + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class Olmo2Model(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + assert isinstance(self.config, Olmo2Config) + + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + prefix=f"{prefix}.embed_tokens", + ) + self.start_layer, self.end_layer, self.layers = make_layers( + self.config.num_hidden_layers, + lambda prefix: Olmo2DecoderLayer(vllm_config=vllm_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + self.norm = RMSNorm( + self.config.hidden_size, + eps=self.config.rms_norm_eps, + ) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory(["hidden_states"], + self.config.hidden_size)) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + ) -> Union[torch.Tensor, IntermediateTensors]: + """ + :param input_ids: A tensor of shape `(batch_size, seq_len)`. + """ + if get_pp_group().is_first_rank: + # Get embeddings of input. + # shape: (batch_size, seq_len, d_model) + inputs_embeds = self.embed_tokens(input_ids) + + # embed positions + hidden_states = inputs_embeds + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + assert isinstance(hidden_states, torch.Tensor) + + # Apply blocks one-by-one. + for i in range(self.start_layer, self.end_layer): + # shape: (batch_size, seq_len, d_model) + hidden_states = self.layers[i]( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({"hidden_states": hidden_states}) + + # Apply final layer norm. + # shape: (batch_size, seq_len or 1, d_model) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class Olmo2ForCausalLM(nn.Module, SupportsPP): + """ + Extremely barebones HF model wrapper. 
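+
+    It adds an LM head (tied to the input embeddings when
+    ``tie_word_embeddings`` is set), a logits processor, a sampler and
+    checkpoint weight loading on top of ``Olmo2Model``.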
+ """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + assert isinstance(config, Olmo2Config) + self.config = config + self.model = Olmo2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=vllm_config.quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + if is_pp_missing_parameter(name, self): + continue + # With tie_word_embeddings, we can skip lm_head.weight + # The weight might appear unnecessarily in the files if the model is + # processed with quantization, LoRA, fine-tuning, etc. + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader # type: ignore + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
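+                # (This for/else branch runs only when none of the stacked
+                # parameter mappings matched `name`; the weight then goes
+                # through the default loading path below.)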
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 184f4b2bc1526..f5a02a5b25ca2 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -74,6 +74,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), + "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 70d18d40b7aa7..4c096acdf2035 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -28,8 +28,8 @@ MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, - RWConfig, SolarConfig, - UltravoxConfig) + Olmo2Config, RWConfig, + SolarConfig, UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -62,6 +62,7 @@ "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, + "olmo2": Olmo2Config, "solar": SolarConfig, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index d1e19c9a33c24..4c721001d8434 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -15,6 +15,7 @@ from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.olmo2 import Olmo2Config from vllm.transformers_utils.configs.solar import SolarConfig from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -33,6 +34,7 @@ "MLPSpeculatorConfig", "NemotronConfig", "NVLM_D_Config", + "Olmo2Config", "SolarConfig", "UltravoxConfig", ] \ No newline at end of file diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py new file mode 100644 index 0000000000000..0e6d8e4879b06 --- /dev/null +++ b/vllm/transformers_utils/configs/olmo2.py @@ -0,0 +1,166 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# Copied from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/configuration_olmo2.py +"""OLMo 2 configuration.""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class Olmo2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 50304): + Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Olmo2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 50279): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. 
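+
+        For example, a valid `rope_scaling` value is `{"type": "linear", "factor": 2.0}`
+        (see `_rope_scaling_validation` below for the accepted fields).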
+ + ```python + >>> from transformers import Olmo2Model, Olmo2Config + + >>> # Initializing a Olmo2 7B style configuration + >>> configuration = Olmo2Config() + + >>> # Initializing a model from the Olmo2 7B style configuration + >>> model = Olmo2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "olmo2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50304, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + use_cache=True, + pad_token_id=1, + bos_token_id=None, + eos_token_id=50279, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + rms_norm_eps=1e-5, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + self.rms_norm_eps = rms_norm_eps + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") From 61dc22baed8a85b7ac62b676f1eaf9664ac2f65a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Nov 2024 17:04:50 -0800 Subject: [PATCH 040/293] [misc] do not read HOST_IP (#10644) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- vllm/envs.py | 2 +- vllm/executor/ray_gpu_executor.py | 4 ++-- vllm/executor/ray_hpu_executor.py | 4 ++-- vllm/utils.py | 7 +++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 14c1617f1be19..c896770e5f6bc 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -153,7 +153,7 @@ def get_default_config_root(): # If you are using multi-node inference, you should set this differently # on each node. 
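+    # Note that only VLLM_HOST_IP is read here; the generic HOST_IP variable
+    # is intentionally ignored (get_ip() in vllm/utils.py warns if only
+    # HOST_IP is set).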
'VLLM_HOST_IP': - lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""), + lambda: os.getenv('VLLM_HOST_IP', ""), # used in distributed environment to manually set the communication port # Note: if VLLM_PORT is set, and some code asks for multiple ports, the diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 810b0f06ff7b2..6542b18ae70b1 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -216,8 +216,8 @@ def sort_by_driver_then_worker_ip(worker): f"Every node should have a unique IP address. Got {n_nodes}" f" nodes with node ids {list(node_workers.keys())} and " f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" + " network configuration. If you set `VLLM_HOST_IP`" + " environment variable, make sure it is unique for" " each node.") VLLM_INSTANCE_ID = get_vllm_instance_id() diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 6fe8c6c403358..a74328e5aa272 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -192,8 +192,8 @@ def sort_by_driver_then_worker_ip(worker): f"Every node should have a unique IP address. Got {n_nodes}" f" nodes with node ids {list(node_workers.keys())} and " f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" + " network configuration. If you set `VLLM_HOST_IP` " + "environment variable, make sure it is unique for" " each node.") VLLM_INSTANCE_ID = get_vllm_instance_id() diff --git a/vllm/utils.py b/vllm/utils.py index dd4283e3ac381..bec876d983701 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -467,6 +467,13 @@ async def collect_from_async_generator( def get_ip() -> str: host_ip = envs.VLLM_HOST_IP + if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ: + logger.warning( + "The environment variable HOST_IP is deprecated and ignored, as" + " it is often used by Docker and other software to" + "interact with the container's network stack. 
Please" + "use VLLM_HOST_IP instead to set the IP address for vLLM processes" + " to communicate with each other.") if host_ip: return host_ip From ea0c690b1517765049a8c65ceaa3ebad88e3b239 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Nov 2024 18:32:09 -0800 Subject: [PATCH 041/293] [bugfix] fix aria model and add torch.compile (#10645) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- vllm/model_executor/models/aria.py | 26 ++++---------------------- vllm/model_executor/models/llama.py | 16 ++++++++++------ 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 0356435e9c257..fa6b95f5481ad 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -29,7 +29,7 @@ LlamaModel) from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, - make_layers, maybe_prefix, + maybe_prefix, merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs @@ -363,27 +363,9 @@ class AriaMoELMModel(LlamaModel): """ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - # FIXME: this is a hack to disable the compilation of the model - self.do_not_compile = True - - self.layers = None - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MoEDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - ), - prefix=f"{prefix}.layers", - ) + super().__init__(vllm_config=vllm_config, + prefix=prefix, + layer_type=MoEDecoderLayer) # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 33d78d74129c8..355b2f3ef8b28 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union import torch from torch import nn @@ -273,7 +273,11 @@ def forward( @support_torch_compile class LlamaModel(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -299,10 +303,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: LlamaDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + lambda prefix: layer_type(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) if get_pp_group().is_last_rank: From e8d3cc3b24b68f02f70e5ef90a901984263d61f5 Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Tue, 26 Nov 2024 08:02:39 +0530 Subject: [PATCH 042/293] [Feature] vLLM ARM Enablement for AARCH64 CPUs (#9228) Signed-off-by: Sanket Kale Co-authored-by: Sanket Kale Co-authored-by: mgoin Signed-off-by: Andrew Feldman --- Dockerfile.arm | 62 +++ cmake/cpu_extension.cmake | 33 +- csrc/cpu/attention.cpp | 18 +- csrc/cpu/cpu_types.hpp | 6 +- csrc/cpu/cpu_types_arm.hpp | 515 ++++++++++++++++++ .../getting_started/arm-installation.rst | 50 ++ docs/source/index.rst | 1 + examples/offline_inference.py | 2 +- requirements-cpu.txt | 7 +- 9 files changed, 678 insertions(+), 16 deletions(-) create mode 100644 Dockerfile.arm create mode 100644 csrc/cpu/cpu_types_arm.hpp create mode 100644 docs/source/getting_started/arm-installation.rst diff --git a/Dockerfile.arm b/Dockerfile.arm new file mode 100644 index 0000000000000..093ee2209222f --- /dev/null +++ b/Dockerfile.arm @@ -0,0 +1,62 @@ +# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. + +FROM ubuntu:22.04 AS cpu-test-arm + +ENV CCACHE_DIR=/root/.cache/ccache + +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update -y \ + && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. 
+RUN --mount=type=cache,target=/root/.cache/pip \ + pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores + +# Set LD_PRELOAD for tcmalloc on ARM +ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +WORKDIR /workspace + +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + pip install --upgrade pip && \ + pip install -r requirements-build.txt + +FROM cpu-test-arm AS build + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ + pip install -v -r requirements-cpu.txt + +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +# Disabling AVX512 specific optimizations for ARM +ARG VLLM_CPU_DISABLE_AVX512="true" +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ + pip install dist/*.whl && \ + rm -rf dist + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 426189481575b..68f7ca1af05ad 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -16,16 +16,15 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # # Check the compile flags # -if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") - list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-DVLLM_CPU_EXTENSION") -else() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" "-mf16c" - "-DVLLM_CPU_EXTENSION") + ) endif() +list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") execute_process(COMMAND cat /proc/cpuinfo RESULT_VARIABLE CPUINFO_RET @@ -59,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) +find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support +find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -78,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) else() message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." 
" If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() + elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") + elseif (POWER9_FOUND OR POWER10_FOUND) message(STATUS "PowerPC detected") # Check for PowerPC VSX support @@ -88,8 +91,20 @@ elseif (POWER9_FOUND OR POWER10_FOUND) "-mvsx" "-mcpu=native" "-mtune=native") + +elseif (ASIMD_FOUND) + message(STATUS "ARMv8 or later architecture detected") + if(ARM_BF16_FOUND) + message(STATUS "BF16 extension detected") + set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") + add_compile_definitions(ARM_BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") + set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") + endif() + list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() # @@ -159,4 +174,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e6c03dcb034fd..e21832ba7582f 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -51,6 +51,10 @@ struct KernelVecType { using v_load_vec_type = vec_op::BF16Vec16; }; #else + #ifdef __aarch64__ + #ifndef ARM_BF16_SUPPORT + // pass + #else template <> struct KernelVecType { using q_load_vec_type = vec_op::BF16Vec8; @@ -60,6 +64,18 @@ struct KernelVecType { using qk_acc_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::BF16Vec16; }; + #endif + #else +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; + #endif #endif template @@ -779,4 +795,4 @@ void paged_attention_v2( CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) }); -} +} \ No newline at end of file diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 0213be09105ed..28db0479748bf 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -1,4 +1,3 @@ - #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP @@ -8,8 +7,11 @@ #elif defined(__POWER9_VECTOR__) //ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__aarch64__) + //arm implementation + #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" #endif -#endif +#endif \ No newline at end of file diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp new file mode 100644 index 0000000000000..73e0f8cb2e0fb --- /dev/null +++ b/csrc/cpu/cpu_types_arm.hpp @@ -0,0 +1,515 @@ +#include +#include +#include + +namespace vec_op { + +#ifdef ARM_BF16_SUPPORT + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) +#endif + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { + template + constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); + }; +}; + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + float16x8_t reg; + + explicit FP16Vec8(const void *ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; + + explicit FP16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { + vst1q_f16(static_cast<__fp16 *>(ptr), reg); + } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void *ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void *ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void *ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + } + + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + for (int i = 0; i < remainder; ++i) { + reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + } + } + } +}; + + +#ifdef ARM_BF16_SUPPORT +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + bfloat16x8_t reg; + + explicit BF16Vec8(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; + + explicit BF16Vec8(const FP32Vec8 &); + + explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + bfloat16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; + + explicit BF16Vec16(const FP32Vec16 &); + + explicit BF16Vec16(float32x4x4_t v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) + }){}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; +}; + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + bfloat16x8x4_t reg; + + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; +}; +#endif + +struct FP32Vec4 : 
public Vec { + constexpr static int VEC_ELEM_NUM = 4; + + union AliasReg { + float32x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4_t reg; + + explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {}; + + explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; + + explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + + explicit FP32Vec4(float32x4_t data) : reg(data) {}; + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + float32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x2_t reg; + + explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {}; + + explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; + + explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + + explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + + explicit FP32Vec8(const FP16Vec8 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; + + explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + + #ifdef ARM_BF16_SUPPORT + + explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + + explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + + #endif + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])}; + float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])}; + float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])}; + float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1); + float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])}; + float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])}; + float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])}; + float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1); + float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + + float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); + float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], 
b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + float32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x4_t reg; + + explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + + explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + + explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + + explicit FP32Vec16(float32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + + #ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) : reg({ + vcvtq_low_f32_bf16(v.val[0]), + vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), + vcvtq_high_f32_bf16(v.val[1]) + }) {}; + #endif + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + }; + + #ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16 &v) : reg({ + vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1]) + }) {}; + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; + #endif + + explicit FP32Vec16(const FP16Vec16 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); + }; + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3]) + })); + }; + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3]) + })); + }; + + float reduce_sum() 
const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + }; + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float answer = 0; + const int start = idx * group_size; + unroll_loop( + [&answer, &start, ar](int i) { answer += ar.values[start + i]; }); + + return answer; + }; + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + vst1q_f32(ptr + 8, reg.val[2]); + vst1q_f32(ptr + 12, reg.val[3]); + }; +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = FP16Vec8; }; + +#ifdef ARM_BF16_SUPPORT +template <> struct VecType { using vec_type = BF16Vec8; }; +#endif + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<__fp16 *>(ptr) = v; +} + +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); + + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); +}; + +inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); + + reg = vcombine_f16(lower_half, upper_half); +}; + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); +}; + +#ifdef ARM_BF16_SUPPORT +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); + float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); + float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); + float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1])); + + float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0])); + float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0])); + float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1])); + float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1])); + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); +}; +#endif + +#ifdef ARM_BF16_SUPPORT +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) + }){}; +#endif + +inline void prefetch(const void *addr) { + __builtin_prefetch(addr, 0, 1); +}; + +#ifdef ARM_BF16_SUPPORT +template <> +inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +}; 
+#endif +}; \ No newline at end of file diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst new file mode 100644 index 0000000000000..7b457df92c11d --- /dev/null +++ b/docs/source/getting_started/arm-installation.rst @@ -0,0 +1,50 @@ +.. _installation_arm: + +Installation for ARM CPUs +========================= + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: + +* CPU backend inference capabilities +* Relevant runtime environment variables +* Performance optimization tips + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. +Contents: + +1. :ref:`Requirements ` +2. :ref:`Quick Start with Dockerfile ` +3. :ref:`Building from Source ` + +.. _arm_backend_requirements: + +Requirements +------------ + +* **Operating System**: Linux or macOS +* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +* **Instruction Set Architecture (ISA)**: NEON support is required + +.. _arm_backend_quick_start_dockerfile: + +Quick Start with Dockerfile +--------------------------- + +You can quickly set up vLLM on ARM using Docker: + +.. code-block:: console + + $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env + +.. _build_arm_backend_from_source: + +Building from Source +-------------------- + +To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index c2afd806c50f9..0692e949f1c77 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -67,6 +67,7 @@ Documentation getting_started/openvino-installation getting_started/cpu-installation getting_started/gaudi-installation + getting_started/arm-installation getting_started/neuron-installation getting_started/tpu-installation getting_started/xpu-installation diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 9b758fa2479f6..23cc6e8539431 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -19,4 +19,4 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 749b03a0603d8..db8ad9d3a015d 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -1,6 +1,7 @@ # Common dependencies -r requirements-common.txt -# Dependencies for x86_64 CPUs -torch == 2.5.1+cpu; platform_machine != "ppc64le" -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +# Dependencies for CPUs +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" +torch==2.5.1; platform_machine == "aarch64" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch \ No newline at end of file From ee2c7f56fce31bbb7bfcb9b25f1ea712bafe72e5 Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Mon, 25 Nov 2024 21:09:43 -0800 Subject: [PATCH 043/293] [v1] EngineArgs for better config handling for v1 (#10382) Signed-off-by: rickyx Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/engine/test_async_llm.py | 3 ++ tests/v1/engine/test_engine_args.py | 42 +++++++++++++++++ tests/v1/engine/test_engine_core.py | 3 +- tests/v1/engine/test_engine_core_client.py | 6 ++- vllm/engine/arg_utils.py | 53 ++++++++++++++++++++-- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 2 +- vllm/engine/multiprocessing/engine.py | 2 +- vllm/entrypoints/openai/api_server.py | 4 +- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 13 ------ vllm/v1/engine/llm_engine.py | 2 +- 13 files changed, 109 insertions(+), 27 deletions(-) create mode 100644 tests/v1/engine/test_engine_args.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff33d35b423e..fc23c9cff0d87 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -172,7 +172,7 @@ steps: - vllm/ - tests/v1 commands: - - pytest -v -s v1 + - VLLM_USE_V1=1 pytest -v -s v1 - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 1f26fe0fc892f..fffb5b8100ec7 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -32,6 +32,9 @@ async def generate(engine: AsyncLLM, request_id: str, @pytest.mark.asyncio async def test_load(monkeypatch): + # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 + # so that in the future when we switch, we don't have to change all the + # tests. 
with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py new file mode 100644 index 0000000000000..69cfdf5a395c1 --- /dev/null +++ b/tests/v1/engine/test_engine_args.py @@ -0,0 +1,42 @@ +import pytest + +from vllm import envs +from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.usage.usage_lib import UsageContext + +if not envs.VLLM_USE_V1: + pytest.skip( + "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", + allow_module_level=True, + ) + + +def test_defaults(): + engine_args = EngineArgs(model="facebook/opt-125m") + + # Assert V1 defaults + assert (engine_args.enable_prefix_caching + ), "V1 turns on prefix caching by default" + + +def test_defaults_with_usage_context(): + engine_args = EngineArgs(model="facebook/opt-125m") + vllm_config: VllmConfig = engine_args.create_engine_config( + UsageContext.LLM_CLASS) + + assert vllm_config.scheduler_config.max_num_seqs == 1024 + assert vllm_config.scheduler_config.max_num_batched_tokens == 8192 + + engine_args = EngineArgs(model="facebook/opt-125m") + vllm_config = engine_args.create_engine_config( + UsageContext.OPENAI_API_SERVER) + assert vllm_config.scheduler_config.max_num_seqs == 1024 + assert vllm_config.scheduler_config.max_num_batched_tokens == 2048 + + +def test_prefix_cache_disabled_with_multimodel(): + engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf") + + vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS) + assert not vllm_config.cache_config.enable_prefix_caching diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index b3692b594326a..bd11ff1877064 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -43,7 +43,8 @@ def test_engine_core(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e248e35ae4069..582192196aaf9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -82,7 +82,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( vllm_config, @@ -153,7 +154,8 @@ async def test_engine_core_client_asyncio(monkeypatch): m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( vllm_config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca68c1d57151c..60ad5ee54a2f2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization import 
QUANTIZATION_METHODS from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file +from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean if TYPE_CHECKING: @@ -113,7 +114,7 @@ class EngineArgs: # NOTE(kzawora): default block size for Gaudi should be 128 # smaller sizes still work, but very inefficiently block_size: int = 16 if not current_platform.is_hpu() else 128 - enable_prefix_caching: bool = False + enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True swap_space: float = 4 # GiB @@ -197,6 +198,11 @@ def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model + # Override the default value of enable_prefix_caching if it's not set + # by user. + if self.enable_prefix_caching is None: + self.enable_prefix_caching = bool(envs.VLLM_USE_V1) + # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -953,7 +959,12 @@ def create_load_config(self) -> LoadConfig: ignore_patterns=self.ignore_patterns, ) - def create_engine_config(self) -> VllmConfig: + def create_engine_config(self, + usage_context: Optional[UsageContext] = None + ) -> VllmConfig: + if envs.VLLM_USE_V1: + self._override_v1_engine_args(usage_context) + # gguf file needs a specific model loader and doesn't use hf_repo if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" @@ -1170,7 +1181,7 @@ def create_engine_config(self) -> VllmConfig: or "all" in detailed_trace_modules, ) - return VllmConfig( + config = VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, @@ -1185,6 +1196,42 @@ def create_engine_config(self) -> VllmConfig: compilation_config=self.compilation_config, ) + if envs.VLLM_USE_V1: + self._override_v1_engine_config(config) + return config + + def _override_v1_engine_args(self, usage_context: UsageContext) -> None: + """ + Override the EngineArgs's args based on the usage context for V1. + """ + assert envs.VLLM_USE_V1, "V1 is not enabled" + + if self.max_num_batched_tokens is None: + # When no user override, set the default values based on the + # usage context. + if usage_context == UsageContext.LLM_CLASS: + logger.warning("Setting max_num_batched_tokens to 8192 " + "for LLM_CLASS usage context.") + self.max_num_seqs = 1024 + self.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + logger.warning("Setting max_num_batched_tokens to 2048 " + "for OPENAI_API_SERVER usage context.") + self.max_num_seqs = 1024 + self.max_num_batched_tokens = 2048 + + def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: + """ + Override the EngineConfig's configs based on the usage context for V1. + """ + assert envs.VLLM_USE_V1, "V1 is not enabled" + # TODO (ywang96): Enable APC by default when VLM supports it. 
+ if engine_config.model_config.is_multimodal_model: + logger.warning( + "Prefix caching is currently not supported for multimodal " + "models and has been disabled.") + engine_config.cache_config.enable_prefix_caching = False + @dataclass class AsyncEngineArgs(EngineArgs): diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5a5388708b1c6..3224577c567f8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -680,7 +680,7 @@ def from_engine_args( """Creates an async LLM engine from the engine arguments.""" # Create the engine configs. if engine_config is None: - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fb21b2dedeb74..a4975cece9a81 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -568,7 +568,7 @@ def from_engine_args( ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. engine = cls( diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 7de23643a2e1c..49a90b321dac4 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -111,7 +111,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, from vllm.plugins import load_general_plugins load_general_plugins() - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = LLMEngine._get_executor_cls(engine_config) use_async_sockets = engine_config.model_config.use_async_output_proc diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc018be982bff..6bc31ef83ded4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -135,8 +135,8 @@ async def build_async_engine_client_from_engine_args( # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): - - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config( + UsageContext.OPENAI_API_SERVER) uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), "uses_ray", False) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c44ebb2a85ba0..a17c8eac4b77c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -94,7 +94,7 @@ def from_engine_args( # Create the engine configs. if engine_config is None: - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config(usage_context) else: vllm_config = engine_config diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 1a978fbe7355f..34f99dd30ef2e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -41,19 +41,6 @@ def __init__( executor_class: Type[GPUExecutor], usage_context: UsageContext, ): - # Override the configs for V1. 
- # FIXME - if usage_context == UsageContext.LLM_CLASS: - vllm_config.scheduler_config.max_num_seqs = 1024 - vllm_config.scheduler_config.max_num_batched_tokens = 8192 - elif usage_context == UsageContext.OPENAI_API_SERVER: - vllm_config.scheduler_config.max_num_seqs = 1024 - vllm_config.scheduler_config.max_num_batched_tokens = 2048 - - # TODO (ywang96): Enable APC by default when VLM supports it. - if not vllm_config.model_config.is_multimodal_model: - vllm_config.cache_config.enable_prefix_caching = True - assert vllm_config.model_config.task != "embedding" logger.info("Initializing an LLM engine (v%s) with config: %s", diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 75a77be750acd..7a5482f03b6fa 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -82,7 +82,7 @@ def from_engine_args( """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(vllm_config) if VLLM_ENABLE_V1_MULTIPROCESSING: From 0bd61fb083e87b2eb8c9d09a1e3c5d9cb30f0c11 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 26 Nov 2024 00:00:16 -0600 Subject: [PATCH 044/293] custom allreduce + torch.compile (#10121) Signed-off-by: youkaichao Co-authored-by: youkaichao Signed-off-by: Andrew Feldman --- docs/source/getting_started/debugging.rst | 1 - tests/distributed/test_pynccl.py | 15 +-- tests/distributed/test_utils.py | 2 - .../device_communicators/pynccl.py | 26 ++--- vllm/distributed/parallel_state.py | 110 ++++++------------ vllm/v1/worker/gpu_model_runner.py | 6 +- 6 files changed, 59 insertions(+), 101 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 77bf550601346..0c1afcbd7c0b9 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -86,7 +86,6 @@ If GPU/CPU communication cannot be established, you can use the following Python from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) - pynccl.disabled = False s = torch.cuda.Stream() with torch.cuda.stream(s): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index f702d7c46ea73..fb24d6bc2c100 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -60,7 +60,7 @@ def worker_fn(): tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): - pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) result = tensor.mean().cpu().item() assert result == pynccl_comm.world_size @@ -84,12 +84,12 @@ def multiple_allreduce_worker_fn(): with pynccl_comm.change_state(enable=True): # two groups can communicate independently if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.all_reduce(tensor) - pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) result = tensor.mean().cpu().item() assert result == 4 else: - pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) result = tensor.mean().cpu().item() assert result == 2 @@ -140,14 +140,11 @@ def worker_fn_with_cudagraph(): with torch.cuda.graph( graph, stream=pynccl_comm.stream), pynccl_comm.change_state( enable=True): - # operation during the graph capture is recorded but not 
executed - # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa - pynccl_comm.all_reduce(a) + a_out = pynccl_comm.all_reduce(a) pynccl_comm.stream.synchronize() - assert a.mean().cpu().item() == pynccl_comm.world_size**0 graph.replay() pynccl_comm.stream.synchronize() - assert a.mean().cpu().item() == pynccl_comm.world_size**1 + assert a_out.mean().cpu().item() == pynccl_comm.world_size**1 @worker_fn_wrapper diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 686b697c98e03..5fb1ae7b29fd2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -70,14 +70,12 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): rank=rank, world_size=WORLD_SIZE) pynccl1 = PyNcclCommunicator(pg1, device=rank) - pynccl1.disabled = False if rank <= 2: pg2 = StatelessProcessGroup.create(host="127.0.0.1", port=port2, rank=rank, world_size=3) pynccl2 = PyNcclCommunicator(pg2, device=rank) - pynccl2.disabled = False data = torch.tensor([rank]).cuda() pynccl1.all_reduce(data) pg1.barrier() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 7411304eb18fa..d4e3f81747038 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -106,30 +106,30 @@ def __init__( self.stream.synchronize() del data - # by default it is disabled, e.g. in profiling models and prefill phase. - # to use it, use under `with obj.change_state(enable=True)`, usually - # when we are using CUDA graph. - self.disabled = True - def all_reduce(self, - tensor: torch.Tensor, + in_tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, - stream=None): + stream=None) -> torch.Tensor: if self.disabled: - return + return None # nccl communicator created on a specific device # will only work on tensors on the same device # otherwise it will cause "illegal memory access" - assert tensor.device == self.device, ( + assert in_tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {tensor.device}") + f"but the input tensor is on {in_tensor.device}") + + out_tensor = torch.empty_like(in_tensor) + if stream is None: stream = self.stream - self.nccl.ncclAllReduce(buffer_type(tensor.data_ptr()), - buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), + self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()), + buffer_type(out_tensor.data_ptr()), + in_tensor.numel(), + ncclDataTypeEnum.from_torch(in_tensor.dtype), ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) + return out_tensor def all_gather(self, output_tensor: torch.Tensor, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 87ade377266a2..ccbe00386c5da 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -96,42 +96,24 @@ def _register_group(group: "GroupCoordinator") -> None: _groups[group.unique_name] = weakref.ref(group) -if supports_custom_op(): - - def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None: - assert group_name in _groups, f"Group {group_name} is not found." 
- group = _groups[group_name]() - if group is None: - raise ValueError(f"Group {group_name} is destroyed.") - group._all_reduce_in_place(tensor) - - def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None: - return +def all_reduce(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + assert group_name in _groups, f"Group {group_name} is not found." + group = _groups[group_name]() + if group is None: + raise ValueError(f"Group {group_name} is destroyed.") + return group._all_reduce_out_place(tensor) - direct_register_custom_op( - op_name="inplace_all_reduce", - op_func=inplace_all_reduce, - mutates_args=["tensor"], - fake_impl=inplace_all_reduce_fake, - ) - def outplace_all_reduce(tensor: torch.Tensor, - group_name: str) -> torch.Tensor: - assert group_name in _groups, f"Group {group_name} is not found." - group = _groups[group_name]() - if group is None: - raise ValueError(f"Group {group_name} is destroyed.") - return group._all_reduce_out_place(tensor) +def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + return torch.empty_like(tensor) - def outplace_all_reduce_fake(tensor: torch.Tensor, - group_name: str) -> torch.Tensor: - return torch.empty_like(tensor) +if supports_custom_op(): direct_register_custom_op( - op_name="outplace_all_reduce", - op_func=outplace_all_reduce, + op_name="all_reduce", + op_func=all_reduce, mutates_args=[], - fake_impl=outplace_all_reduce_fake, + fake_impl=all_reduce_fake, ) @@ -317,30 +299,13 @@ def graph_capture( stream.wait_stream(curr_stream) with torch.cuda.stream(stream), maybe_ca_context: - # In graph mode, we have to be very careful about the collective - # operations. The current status is: - # allreduce \ Mode | Eager | Graph | - # -------------------------------------------- - # custom allreduce | enabled | enabled | - # PyNccl | disabled| enabled | - # torch.distributed | enabled | disabled| - # - # Note that custom allreduce will have a runtime check, if the - # tensor size is too large, it will fallback to the next - # available option. - # In summary: When using CUDA graph, we use - # either custom all-reduce kernel or pynccl. When not using - # CUDA graph, we use either custom all-reduce kernel or - # PyTorch NCCL. We always prioritize using custom all-reduce - # kernel but fall back to PyTorch or pynccl if it is - # disabled or not supported. pynccl_comm = self.pynccl_comm maybe_pynccl_context: Any if not pynccl_comm: maybe_pynccl_context = nullcontext() else: maybe_pynccl_context = pynccl_comm.change_state( - enable=True, stream=torch.cuda.current_stream()) + stream=torch.cuda.current_stream()) with maybe_pynccl_context: yield graph_capture_context @@ -356,8 +321,8 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: coordinator. In addition, PyTorch custom ops do not support mutation or returning - a new tensor in the same op. So we need to figure out if the op is - in-place or out-of-place ahead of time. + a new tensor in the same op. So we always make the all-reduce operation + out-of-place. """ # Bypass the function if we are using only 1 GPU. if self.world_size == 1: @@ -368,10 +333,6 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: ipex.distributed.all_reduce(input_, group=self.device_group) return input_ - if not supports_custom_op(): - self._all_reduce_in_place(input_) - return input_ - if self.tpu_communicator is not None and \ not self.tpu_communicator.disabled: # TPU handles Dynamo with its own logic. 
@@ -385,30 +346,31 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: not self.xpu_communicator.disabled: return self.xpu_communicator.all_reduce(input_) - if self.ca_comm is not None and \ - not self.ca_comm.disabled and \ - self.ca_comm.should_custom_ar(input_): - return torch.ops.vllm.outplace_all_reduce( - input_, group_name=self.unique_name) - else: - torch.ops.vllm.inplace_all_reduce(input_, - group_name=self.unique_name) - return input_ + return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor: + # always try custom allreduce first, + # and then pynccl. ca_comm = self.ca_comm - assert ca_comm is not None - assert not ca_comm.disabled - out = ca_comm.custom_all_reduce(input_) - assert out is not None - return out - - def _all_reduce_in_place(self, input_: torch.Tensor) -> None: + if ca_comm is not None and not ca_comm.disabled and \ + ca_comm.should_custom_ar(input_): + out = ca_comm.custom_all_reduce(input_) + assert out is not None + return out pynccl_comm = self.pynccl_comm - if (pynccl_comm is not None and not pynccl_comm.disabled): - pynccl_comm.all_reduce(input_) - else: - torch.distributed.all_reduce(input_, group=self.device_group) + assert pynccl_comm is not None + # TODO: pynccl should not use `stream=` + # it can just always use the current stream. + out = pynccl_comm.all_reduce(input_, + stream=torch.cuda.current_stream()) + if out is None: + # fall back to the default all-reduce using PyTorch. + # this usually happens during testing. + # when we run the model, allreduce only happens for the TP + # group, where we always have either custom allreduce or pynccl. + out = input_.clone() + torch.distributed.all_reduce(out, group=self.device_group) + return out def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: world_size = self.world_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 02f9498142bb7..13cbc8fa39c03 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -10,6 +10,7 @@ from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig +from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -570,8 +571,9 @@ def capture_model(self) -> None: # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. 
- for num_tokens in reversed(self.cudagraph_batch_sizes): - self._dummy_run(self.model, num_tokens, self.kv_caches) + with graph_capture(): + for num_tokens in reversed(self.cudagraph_batch_sizes): + self._dummy_run(self.model, num_tokens, self.kv_caches) end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] From dc8a363d255229d8159947605982ae58ac598a39 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 14:55:00 +0800 Subject: [PATCH 045/293] [Misc] Remove outdated init protocols (#10655) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- vllm/model_executor/models/interfaces.py | 30 ------------------- vllm/model_executor/models/interfaces_base.py | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 4f0c75b2c6a57..9b4a97abf9b51 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -10,7 +10,6 @@ from .interfaces_base import is_embedding_model if TYPE_CHECKING: - from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -29,9 +28,6 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ - def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: - ... - # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead @@ -39,9 +35,6 @@ def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: class _SupportsMultiModalType(Protocol): supports_multimodal: Literal[True] - def __call__(self, *, multimodal_config: "MultiModalConfig") -> None: - ... - @overload def supports_multimodal( @@ -81,10 +74,6 @@ class SupportsLoRA(Protocol): embedding_modules: ClassVar[Dict[str, str]] embedding_padding_modules: ClassVar[List[str]] - # lora_config is None when LoRA is not enabled - def __init__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: - ... - # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead @@ -97,9 +86,6 @@ class _SupportsLoRAType(Protocol): embedding_modules: Dict[str, str] embedding_padding_modules: List[str] - def __call__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: - ... - @overload def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: @@ -276,21 +262,11 @@ class HasInnerState(Protocol): for max_num_seqs, etc. True for e.g. both Mamba and Jamba. """ - def __init__(self, - *, - scheduler_config: Optional["SchedulerConfig"] = None) -> None: - ... - @runtime_checkable class _HasInnerStateType(Protocol): has_inner_state: ClassVar[Literal[True]] - def __init__(self, - *, - scheduler_config: Optional["SchedulerConfig"] = None) -> None: - ... - @overload def has_inner_state(model: object) -> TypeIs[HasInnerState]: @@ -323,17 +299,11 @@ class IsAttentionFree(Protocol): True for Mamba but not Jamba. """ - def __init__(self) -> None: - ... - @runtime_checkable class _IsAttentionFreeType(Protocol): is_attention_free: ClassVar[Literal[True]] - def __init__(self) -> None: - ... 
- @overload def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 7bb43beff255c..957a5a6e26b5c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -71,7 +71,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: and issubclass(model, nn.Module)): logger.warning( "The model (%s) is missing " - "vLLM-specific keywords from its initializer: %s", + "vLLM-specific keywords from its `forward` method: %s", model, missing_kws, ) From 1f74fe91d8a37f09173c1349a0c059873f74f6fc Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 26 Nov 2024 00:20:04 -0800 Subject: [PATCH 046/293] [ci] add vllm_test_utils (#10659) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- Dockerfile | 4 ++ Dockerfile.cpu | 4 ++ Dockerfile.hpu | 3 ++ Dockerfile.neuron | 3 ++ Dockerfile.openvino | 3 ++ Dockerfile.ppc64le | 3 ++ Dockerfile.rocm | 3 ++ Dockerfile.tpu | 3 ++ Dockerfile.xpu | 3 +- tests/entrypoints/llm/test_lazy_outlines.py | 23 +++++--- tests/test_lazy_torch_compile.py | 54 +------------------ tests/vllm_test_utils/setup.py | 7 +++ .../vllm_test_utils/__init__.py | 8 +++ .../vllm_test_utils/vllm_test_utils/blame.py | 53 ++++++++++++++++++ 14 files changed, 113 insertions(+), 61 deletions(-) create mode 100644 tests/vllm_test_utils/setup.py create mode 100644 tests/vllm_test_utils/vllm_test_utils/__init__.py create mode 100644 tests/vllm_test_utils/vllm_test_utils/blame.py diff --git a/Dockerfile b/Dockerfile index 220dbe26712ec..682f046d4b6ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,10 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -e tests/vllm_test_utils + # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install hf_transfer diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 287b4958da4e5..d2f72ea975a3d 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -62,4 +62,8 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -e tests/vllm_test_utils + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.hpu b/Dockerfile.hpu index d18fc016387bf..87e0c1a6a934e 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 2143315d2a078..76dbd4c04d3f3 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -38,4 +38,7 @@ ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ pip install --no-build-isolation -v -e . 
+# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index a05ff452cd36e..8bd188ffde408 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC COPY examples/ /workspace/examples COPY benchmarks/ /workspace/benchmarks +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index b19c6ddec7948..971248577983f 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=bind,source=.git,target=.git \ VLLM_TARGET_DEVICE=cpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 62d4a9b4909c3..e733994f8c33e 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -168,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ if ls libs/*.whl; then \ python3 -m pip install libs/*.whl; fi +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 0a507b6ecdf60..b617932a85b47 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ -r requirements-tpu.txt RUN python3 setup.py develop +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 63bc682770422..a374f20d7d949 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image \ TRITON_XPU_PROFILE 1 - +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index cbfb0cc32c1ce..81fb000d8ac56 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,12 +1,12 @@ import sys +from vllm_test_utils import blame + from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory -def test_lazy_outlines(sample_regex): - """If users don't use guided decoding, outlines should not be imported. - """ +def run_normal(): prompts = [ "Hello, my name is", "The president of the United States is", @@ -25,13 +25,12 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - # make sure outlines is not imported - assert 'outlines' not in sys.modules - # Destroy the LLM object and free up the GPU memory. del llm cleanup_dist_env_and_memory() + +def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. 
llm = LLM(model="facebook/opt-125m", enforce_eager=True, @@ -51,5 +50,15 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def test_lazy_outlines(sample_regex): + """If users don't use guided decoding, outlines should not be imported. + """ # make sure outlines is not imported - assert 'outlines' not in sys.modules + module_name = "outlines" + with blame(lambda: module_name in sys.modules) as result: + run_normal() + run_lmfe(sample_regex) + assert not result.found, ( + f"Module {module_name} is already imported, the" + f" first import location is:\n{result.trace_stack}") diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py index b8ac4dd93732b..4756fac8e2a8d 100644 --- a/tests/test_lazy_torch_compile.py +++ b/tests/test_lazy_torch_compile.py @@ -1,61 +1,9 @@ # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script - -import contextlib -import dataclasses import sys -import traceback -from typing import Callable, Generator - - -@dataclasses.dataclass -class BlameResult: - found: bool = False - trace_stack: str = "" - - -@contextlib.contextmanager -def blame(func: Callable) -> Generator[BlameResult, None, None]: - """ - Trace the function calls to find the first function that satisfies the - condition. The trace stack will be stored in the result. - - Usage: - - ```python - with blame(lambda: some_condition()) as result: - # do something - - if result.found: - print(result.trace_stack) - """ - result = BlameResult() - - def _trace_calls(frame, event, arg=None): - nonlocal result - if event in ['call', 'return']: - # for every function call or return - try: - # Temporarily disable the trace function - sys.settrace(None) - # check condition here - if not result.found and func(): - result.found = True - result.trace_stack = "".join(traceback.format_stack()) - # Re-enable the trace function - sys.settrace(_trace_calls) - except NameError: - # modules are deleted during shutdown - pass - return _trace_calls - - sys.settrace(_trace_calls) - - yield result - - sys.settrace(None) +from vllm_test_utils import blame module_name = "torch._inductor.async_compile" diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py new file mode 100644 index 0000000000000..790e891ec837d --- /dev/null +++ b/tests/vllm_test_utils/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup + +setup( + name='vllm_test_utils', + version='0.1', + packages=['vllm_test_utils'], +) diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py new file mode 100644 index 0000000000000..bf0b62a5b75e3 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -0,0 +1,8 @@ +""" +vllm_utils is a package for vLLM testing utilities. +It does not import any vLLM modules. 
+""" + +from .blame import BlameResult, blame + +__all__ = ["blame", "BlameResult"] diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py new file mode 100644 index 0000000000000..ad23ab83c2d81 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -0,0 +1,53 @@ +import contextlib +import dataclasses +import sys +import traceback +from typing import Callable, Generator + + +@dataclasses.dataclass +class BlameResult: + found: bool = False + trace_stack: str = "" + + +@contextlib.contextmanager +def blame(func: Callable) -> Generator[BlameResult, None, None]: + """ + Trace the function calls to find the first function that satisfies the + condition. The trace stack will be stored in the result. + + Usage: + + ```python + with blame(lambda: some_condition()) as result: + # do something + + if result.found: + print(result.trace_stack) + """ + result = BlameResult() + + def _trace_calls(frame, event, arg=None): + nonlocal result + if event in ['call', 'return']: + # for every function call or return + try: + # Temporarily disable the trace function + sys.settrace(None) + # check condition here + if not result.found and func(): + result.found = True + result.trace_stack = "".join(traceback.format_stack()) + # Re-enable the trace function + sys.settrace(_trace_calls) + except NameError: + # modules are deleted during shutdown + pass + return _trace_calls + + sys.settrace(_trace_calls) + + yield result + + sys.settrace(None) From 53f9d49a9b43e8d6513744cd5f64789d5c3770ee Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 26 Nov 2024 18:36:45 +0800 Subject: [PATCH 047/293] [V1] Enable profile for LLMEngine (#10665) Signed-off-by: Andrew Feldman --- vllm/v1/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a5482f03b6fa..bd19d998a4adb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -161,13 +161,13 @@ def step(self) -> List[RequestOutput]: # TODO(rob): Can we get rid of these? 
def get_model_config(self): - pass + return self.model_config def start_profile(self): - pass + self.engine_core.profile(True) def stop_profile(self): - pass + self.engine_core.profile(False) def get_tokenizer_group(self, group_type): pass From e82fe47e81ca87b0cf105526a55e9980a025c991 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 07:48:22 -0500 Subject: [PATCH 048/293] Squash commit of all changes from v1_logprobs Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- tests/v1/samplers/__init__.py | 0 tests/v1/samplers/test_logprobs.py | 340 +++++++++++++++++++ vllm/outputs.py | 16 +- vllm/transformers_utils/detokenizer_utils.py | 51 ++- vllm/v1/core/scheduler.py | 152 ++++++++- vllm/v1/engine/__init__.py | 9 + vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/detokenizer.py | 60 +++- vllm/v1/engine/llm_engine.py | 10 +- vllm/v1/engine/processor.py | 28 +- vllm/v1/outputs.py | 8 +- vllm/v1/request.py | 8 +- vllm/v1/sample/metadata.py | 8 +- vllm/v1/sample/sampler.py | 161 +++++++-- vllm/v1/worker/gpu_model_runner.py | 107 ++++-- 15 files changed, 885 insertions(+), 76 deletions(-) create mode 100644 tests/v1/samplers/__init__.py create mode 100644 tests/v1/samplers/test_logprobs.py diff --git a/tests/v1/samplers/__init__.py b/tests/v1/samplers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py new file mode 100644 index 0000000000000..114ce7bd1f2fb --- /dev/null +++ b/tests/v1/samplers/test_logprobs.py @@ -0,0 +1,340 @@ +from typing import List, Tuple + +import pytest +import torch + +from tests.kernels.utils import override_backend_env_variable +from vllm import SamplingParams + +from ...conftest import VllmRunner + +MODELS = ["facebook/opt-125m"] + + +def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: + """Generate logprobs configs for a batch of requests + + A given request's logprobs configuration is (1) num_sample_logprobs and (2) + num_prompt_logprobs. The batch logprobs configuration is the list of request + logprobs configs. 
+ + batch_logprobs_composition == "NONE" yields a batch with no sample or prompt + logprobs + + batch_logprobs_composition == "SAMPLE" yields a batch with some requests + configured for sample logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "PROMPT" yields a batch with some requests + configured for prompt logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "SAMPLE_PROMPT" yields a batch with some + requests configured for sample logprobs and prompt logprobs, some configured + for only sample logprobs or only prompt logprobs, and some configured for + no logprobs + + Args: + + batch_logprobs_composition: types of logprobs configs to include in batch + + Returns: + + List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + tuples + + """ + if batch_logprobs_composition == "NONE": + # No requests with sample or prompt logprobs + return [(None, None), (0, None), (None, 0), (0, 0)] + elif batch_logprobs_composition == "SAMPLE": + return [ + (None, None), + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + ] + elif batch_logprobs_composition == "PROMPT": + return [ + (None, 0), + (0, None), + (0, 0), + (None, 6), + (0, 5), + ] + elif batch_logprobs_composition == "SAMPLE_PROMPT": + return [ + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + (6, 3), + (None, 6), + (0, 5), + ] + else: + raise ValueError("Invalid logprobs batch configuration for test.") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +def test_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +): + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. 
+ + Args: + hf_runner + vllm_runner + model + dtype + detokenize: if False, return generated tokens bypassing detokenizer + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + detokenize = True + + test_prompts = example_prompts + + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + max_num_seqs = 128 + max_num_batched_tokens = 128 + max_model_len = 128 + + max_tokens = 5 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy( + test_prompts, + max_tokens=max_tokens, + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = _get_test_batch(batch_logprobs_composition) + + # We rely on there being more prompts than combinations of + # logprobs & prompt logprobs which we want to test + assert len(test_prompts) >= len(logprob_prompt_logprob_list) + # Make sure there is a sample params for each prompt + num_extra_params = len(test_prompts) - len(logprob_prompt_logprob_list) + if num_extra_params > 0: + logprob_prompt_logprob_list = ( + logprob_prompt_logprob_list + + logprob_prompt_logprob_list[-num_extra_params:]) + # Now the number of prompts should match the number of sample params combos + assert len(test_prompts) == len(logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=lp, + prompt_logprobs=plp, + temperature=0.0, + detokenize=detokenize) + for lp, plp in logprob_prompt_logprob_list + ] + + with vllm_runner( + model, + dtype=dtype, + max_logprobs=7, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + enforce_eager=True, + ) as vllm_model: + vllm_results = vllm_model.model.generate( + test_prompts, sampling_params=vllm_sampling_params) + + for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( + vllm_results, hf_logprobs, hf_outputs, + logprob_prompt_logprob_list): + + # Extract request-level (prompt)logprobs config + num_top_logprobs = logprob_prompt_logprob[0] + num_top_prompt_logprobs = logprob_prompt_logprob[1] + + # Test whether sampled token output is consistent between vLLM and HF + # vLLM prompt+completion should match HF output + assert (vllm_result.prompt_token_ids + + vllm_result.outputs[0].token_ids == hf_output[0]) + + # Validate sample logprobs + if num_top_logprobs is not None and num_top_logprobs > 0: + assert num_top_logprobs is not None + # Confirm that the structure of the sample logprobs in the result is + # correct + assert vllm_result.outputs[0].logprobs is not None + assert len(vllm_result.outputs[0].logprobs) == max_tokens + for logprobs in vllm_result.outputs[0].logprobs: + assert logprobs is not None + # If the output token is not included in the top X + # logprob, it can return 1 more data + assert (len(logprobs) == num_top_logprobs + or len(logprobs) == num_top_logprobs + 1) + output_text = vllm_result.outputs[0].text + output_string_from_most_likely_tokens_lst: List[str] = [] + for top_logprobs in vllm_result.outputs[0].logprobs: + top_logprob = next(iter(top_logprobs.values())) + output_string_from_most_likely_tokens_lst.append( + top_logprob.decoded_token) + + if detokenize: + output_string_from_most_likely_tokens = "".join( + output_string_from_most_likely_tokens_lst) + assert output_text == 
output_string_from_most_likely_tokens, ( + "The output text from the top logprob for each token " + "position should be the same as the output text in the " + "result.") + else: + assert output_text == '' + assert output_string_from_most_likely_tokens_lst == ( + [None] * max_tokens) + + # Compare vLLM sample logprobs to HF + vllm_sample_logprobs = vllm_result.outputs[0].logprobs + for i, top_logprobs in enumerate(vllm_sample_logprobs): + for token_id, sample_logprob in top_logprobs.items(): + logprob = sample_logprob.logprob + torch.testing.assert_close( + logprob, + hf_logprob[i][-1][token_id].item(), + atol=1e-2, + rtol=1e-2) + if detokenize: + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is" + " returned to the user.") + else: + # Logprobs disabled for this request; should be None + assert vllm_result.outputs[0].logprobs is None + + # Validate prompt logprobs + if (num_top_prompt_logprobs is not None + and num_top_prompt_logprobs > 0): + # Confirm that structure of prompt logprobs in result is correct + assert vllm_result.prompt_logprobs is not None + # - The first prompt logprob is always None + assert vllm_result.prompt_logprobs[0] is None + # - Prompt logprobs are returned for all indices in + # the prompt + assert len(vllm_result.prompt_logprobs) == len( + vllm_result.prompt_token_ids) + for prompt_logprobs in vllm_result.prompt_logprobs[1:]: + assert prompt_logprobs is not None + # - If the prompt token is not included in the top X + # logprob, it can return 1 more data + assert (len(prompt_logprobs) == num_top_prompt_logprobs + or len(prompt_logprobs) == num_top_prompt_logprobs + 1) + + # Compare prompt logprobs to HF + # The first prompt logprob is always None, so we compare it from + # 1:. 
+ vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] + for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): + for token_id, logprob in vllm_prompt_logprob_dict.items(): + torch.testing.assert_close( + logprob.logprob, + hf_logprob[0][i][token_id].item(), + atol=1e-2, + rtol=1e-2) + else: + assert vllm_result.prompt_logprobs is None + + +def test_max_logprobs(monkeypatch): + """vLLM v1 engine should fail a request with `logprobs > max_logprobs` + + Should also fail for `prompt_logprobs > max_logprobs` + + Args: + monkeypatch + """ + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + runner = VllmRunner("facebook/opt-125m", max_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("detokenize", [True, False]) +def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, + monkeypatch): + """Engine should return `logprobs` and `prompt_logprobs` as `None` + + Args: + vllm_runner + model + detokenize: whether to feed generated tokens to detokenizer + example_prompts + monkeypatch + """ + + # LLM engine v1 + monkeypatch.setenv("VLLM_USE_V1", "1") + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + max_num_seqs = 256 + max_num_batched_tokens = None + max_tokens = 5 + + with vllm_runner( + model, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + detokenize=detokenize) + results_logprobs_none = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_none) + + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None diff --git a/vllm/outputs.py b/vllm/outputs.py index 2d256803edfe8..9733158504945 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -127,24 +127,24 @@ def new( prompt_token_ids: Optional[List[int]], text: str, token_ids: List[int], + logprobs: Optional[SampleLogprobs], + prompt_logprobs: Optional[PromptLogprobs], finished: bool = False, ) -> "RequestOutput": """Initialize a new RequestOutput object.""" # TODO: Support `n` > 1. 
- completion_output = CompletionOutput( - index=0, - text=text, - token_ids=token_ids, - cumulative_logprob=None, - logprobs=None, # TODO - ) + completion_output = CompletionOutput(index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=None, + logprobs=logprobs) return RequestOutput( request_id=request_id, prompt=prompt, prompt_token_ids=prompt_token_ids, - prompt_logprobs=None, # TODO + prompt_logprobs=prompt_logprobs, outputs=[completion_output], finished=finished, ) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 37ff8a236e791..885e3b9d92f88 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,4 +1,6 @@ -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple + +from vllm.sequence import Logprob from .tokenizer import AnyTokenizer @@ -165,3 +167,50 @@ def detokenize_incrementally( new_text = new_text[len(prefix_text):] return new_tokens, new_text, read_offset, len(output_tokens) + + +def detokenize_logprob_incrementally_in_place( + tokenizer: AnyTokenizer, + logprob_dict: Dict[int, Logprob], + input_ids_prefix: List[int], + prev_tokens: Optional[List[str]], + prefix_offset: int, + read_offset: int, + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = True, +) -> None: + """Detokenizes the logprobs at a single token offset incrementally. + + For each top-token in `logprob_dict`, apply incremental detokenization + to the token list `input_ids_prefix + [top-token id]` + + The logprob data structure is modified in-place with the string + representation of each decoded top-token. + + Args: + tokenizer: The tokenizer to use. + logprob_dict: logprob data structure for a single token position + input_ids_prefix: The input ids *preceding* the token offset under + consideration + prev_tokens: The previous tokens. If None, this function will convert + the input ids to tokens and return the tokens and the new text. + prefix_offset: The prefix offset. + read_offset: The read offset. + skip_special_tokens: Whether to skip special tokens. + spaces_between_special_tokens: Whether to add spaces between special + tokens. 
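+
+    Returns:
+        None; each ``Logprob`` entry in ``logprob_dict`` is updated in place
+        with its ``decoded_token`` string.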
+ """ + + for token_id in logprob_dict: + # Detokenize logprob for a particular top + # token at a particular token offset + + logprob_dict[token_id].decoded_token = detokenize_incrementally( + tokenizer=tokenizer, + all_input_ids=input_ids_prefix + [token_id], + prev_tokens=prev_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + )[1] diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ba50a9786d805..476b12c705482 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -6,6 +6,7 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.sequence import Logprob from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput @@ -247,6 +248,13 @@ def schedule(self) -> "SchedulerOutput": self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget + # Now that requests are scheduled, generate a mask indicating which + # request is partial + partial_running_reqs = [ + (req.num_computed_tokens + num_scheduled_tokens[req.request_id] < + req.num_tokens) for req in self.running + ] + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -277,6 +285,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, + partial_running_reqs=partial_running_reqs, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -384,11 +393,85 @@ def update_from_output( # NOTE(woosuk): This method doesn't consider speculative decoding. 
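+        # Pythonize the sampled token ids and any CPU-side logprob tensors
+        # up front, before the per-request loop below.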
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + if do_logprobs: + assert model_runner_output.logprob_token_ids_cpu is not None + logprob_token_ids_list = ( + model_runner_output.logprob_token_ids_cpu.tolist()) + logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + if do_prompt_logprobs: + assert model_runner_output.prompt_logprob_token_ids_cpu is not None + prompt_logprob_token_ids_list = ( + model_runner_output.prompt_logprob_token_ids_cpu.tolist()) + prompt_logprob_values_list = ( + model_runner_output.prompt_logprobs_cpu.tolist()) + curr_prompt_base_idx = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] + req_index = model_runner_output.req_id_to_index[req_id] + num_new_tokens = 1 + max_logprobs = request.max_logprobs + request_do_logprobs = (do_logprobs and max_logprobs is not None + and max_logprobs > 0) + + if do_prompt_logprobs: + max_prompt_logprobs = request.max_prompt_logprobs + num_new_prompt_tokens = ( + num_scheduled_tokens[request.request_id] - + int(not scheduler_output.partial_running_reqs[req_index])) + + request_do_prompt_logprobs = (max_prompt_logprobs is not None + and max_prompt_logprobs > 0 + and num_new_prompt_tokens > 0) + + if request_do_prompt_logprobs: + + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. + # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. + + slice_upper_index = (curr_prompt_base_idx + + num_new_prompt_tokens) + prompt_logprob_token_ids = prompt_logprob_token_ids_list[ + curr_prompt_base_idx:slice_upper_index] + prompt_logprob_values = prompt_logprob_values_list[ + curr_prompt_base_idx:slice_upper_index] + curr_prompt_base_idx = slice_upper_index + + logprob_cnt = max_prompt_logprobs + prompt_logprobs = [{ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(plp_tok_values[0:logprob_cnt], + plp_tok_token_ids[0:logprob_cnt])) + } for plp_tok_values, plp_tok_token_ids in zip( + prompt_logprob_values, prompt_logprob_token_ids)] + + if not request.prompt_logprobs: + # Ensure that None is the first prompt logprob + prompt_logprobs = [None] + prompt_logprobs + + curr_prompt_base_idx = slice_upper_index + + prompt_slice_range_upper = request.num_computed_tokens + prompt_slice_range_lower = (prompt_slice_range_upper - + num_new_prompt_tokens) + request.prompt_logprobs.extend(prompt_logprobs) + else: + curr_prompt_base_idx += num_new_prompt_tokens + else: + request_do_prompt_logprobs = False + # When the request's num_computed_tokens catches up its num_tokens, # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. @@ -405,12 +488,45 @@ def update_from_output( self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: - req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request # generates at most one token at each step. 
token_id = sampled_token_ids[req_index] + if request_do_logprobs: + # Construct logprobs, if requested (TODO: assumes one + # generated token). + logprob_token_ids = logprob_token_ids_list[req_index] + logprob_values = logprob_values_list[req_index] + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:max_logprobs]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = ( + [float('-inf')] * + (len(logprob_values) - 1 - max_logprobs)) + + indices = sorted(range(len(logprob_values)), + key=lambda k: logprob_values[k], + reverse=True) + logprob_values = [logprob_values[i] for i in indices] + logprob_token_ids = [ + logprob_token_ids[i] for i in indices + ] + + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_values = logprob_values[0:logprob_cnt] + logprob_token_ids = logprob_token_ids[0:logprob_cnt] + + request.logprobs.append({ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(logprob_values, logprob_token_ids)) + }) request.append_output_token_ids(token_id) - num_new_tokens = 1 # TODO: Update the KV cache manager for prefix caching. # Check for stop and update request state. @@ -418,18 +534,47 @@ def update_from_output( stopped = self._check_stop(request) # Add EngineCoreOutput for this Request. + # Return the logprob for the most recently computed tokens. + # Return no prompt logprobs in decode-phase. output = EngineCoreOutput( request_id=req_id, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason) + stop_reason=request.stop_reason, + logprobs=(request.logprobs[-num_new_tokens:] + if request_do_logprobs else None), + prompt_logprobs=(prompt_logprobs + if request_do_prompt_logprobs else None), + prompt_logprobs_token_ids=(request.prompt_token_ids + if request_do_prompt_logprobs + else None)) engine_core_outputs.append(output) # Breakout of the loop. 
if stopped: continue + elif request_do_prompt_logprobs: + # This request is still partial but prompt logprobs were + # requested + engine_core_outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=[] if request_do_logprobs else None, + prompt_logprobs=( + prompt_logprobs if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)), + prompt_logprobs_token_ids=( + request.prompt_token_ids[prompt_slice_range_lower: + prompt_slice_range_upper] + if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)))) + new_running.append(request) self.running = new_running return engine_core_outputs @@ -581,6 +726,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] + partial_running_reqs: List[bool] # True if running req is partial num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 967124fd850ea..46ee3154d69c0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -7,6 +7,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sequence import PromptLogprobs, SampleLogprobs @dataclass @@ -22,6 +23,11 @@ class DetokenizerRequest: stop: List[str] include_stop_str_in_output: bool + # Per-request logprobs & prompt logprobs + # counts; None is equivalent to 0 + logprobs: Optional[int] + prompt_logprobs: Optional[int] + @dataclass class EngineCoreRequest: @@ -52,6 +58,9 @@ class EngineCoreOutput(msgspec.Struct, request_id: str new_token_ids: List[int] finished: bool + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + prompt_logprobs_token_ids: Optional[List[int]] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a17c8eac4b77c..421ecc8c0d921 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -144,7 +144,8 @@ async def add_request( # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, + request_id, prompt, params, arrival_time, + (await self.get_model_config()).max_logprobs, lora_request, trace_headers, prompt_adapter_request, priority) # 3) Add the request to Detokenizer (this process). 
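Illustrative usage sketch (not part of this diff): how a caller exercises the new per-request logprob settings once the plumbing above is in place. The model name and the counts below are arbitrary assumptions mirroring the tests earlier in this series, which select the v1 engine via the VLLM_USE_V1=1 environment variable.

from vllm import LLM, SamplingParams

# Assumed values for illustration; any small model works.
llm = LLM(model="facebook/opt-125m", max_logprobs=7, enforce_eager=True)
params = SamplingParams(
    max_tokens=5,
    logprobs=3,          # top-3 sample logprobs per generated token
    prompt_logprobs=2,   # top-2 logprobs per prompt token
    temperature=0.0,
)
out = llm.generate(["Hello world"], sampling_params=params)[0]
# List of per-token dicts mapping token id -> Logprob for the completion.
print(out.outputs[0].logprobs)
# Prompt logprobs; by convention the first entry is None.
print(out.prompt_logprobs)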
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6249d60199a62..5ad8b8c725f3e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,21 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind +from vllm.sequence import PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, + detokenize_logprob_incrementally_in_place) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput logger = init_logger(__name__) +AnyLogprobs = Union[Optional[SampleLogprobs], Optional[PromptLogprobs]] + @dataclass class IncrementalDetokenizer: @@ -20,6 +24,8 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] # Stop strings stop: List[str] @@ -72,6 +78,11 @@ def from_new_request( else: stop_buffer_length = 0 + # Logprobs & prompt logprobs settings + do_logprobs = request.logprobs is not None and request.logprobs > 0 + do_prompt_logprobs = (request.prompt_logprobs is not None + and request.prompt_logprobs > 0) + return cls( output_text="", tokens=tokens, @@ -91,25 +102,34 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - ) + logprobs=[] if do_logprobs else None, + prompt_logprobs=[] if do_prompt_logprobs else None) def add_tokens( self, new_token_ids: List[int], + new_logprobs: Optional[SampleLogprobs], + new_prompt_logprobs: Optional[PromptLogprobs], finish_reason: Optional[str], stop_reason: Optional[str], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. + 1a) If necessary, detokenize logprobs incrementally + 1b) If necessary, detokenize prompt logprobs incrementally 2) Update the RequestOutput with the new text. """ - # 1) Detokenize the new token ids incrementally. + do_logprobs = new_logprobs is not None and len(new_logprobs) > 0 + assert not do_logprobs or len(new_logprobs) == len(new_token_ids) + + # 1) Detokenize the new token ids incrementally. If necessary, + # detokenize logprobs. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. 
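+        # The sample-logprob detokenization below reuses the same incremental
+        # tokenizer state (prev_tokens and offsets) as the main decode loop.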
decoded_text = "" - for new_token_id in new_token_ids: + for tdx, new_token_id in enumerate(new_token_ids): self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -123,6 +143,23 @@ def add_tokens( spaces_between_special_tokens, ) + if do_logprobs: + # Detokenize individual token logprobs in-place + logprob_dict = new_logprobs[tdx] + assert logprob_dict is not None + detokenize_logprob_incrementally_in_place( + tokenizer=self.tokenizer, + logprob_dict=logprob_dict, + input_ids_prefix=self.token_ids[0:-1], + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + self.logprobs.append(logprob_dict) + self.tokens.extend(new_tokens) self.prefix_offset = prefix_offset self.read_offset = read_offset @@ -130,6 +167,10 @@ def add_tokens( decoded_text += new_decoded_token_text + # 1b) If necessary, detokenize prompt logprobs incrementally + if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: + self.prompt_logprobs.extend(new_prompt_logprobs) + # 2) Evaluate stop criteria. if self.stop: stop = StopChecker.check_stop_strings( @@ -139,11 +180,10 @@ def add_tokens( include_in_output=self.include_stop_str_in_output, ) if stop is not None: - stop_str, truncate_to = stop + _, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] finish_reason = "stop" # TODO: use constant - stop_reason = stop_str # TODO: handle stop_token_ids here too? @@ -156,6 +196,8 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids + logprobs = new_logprobs if delta else self.logprobs + prompt_logprobs = new_prompt_logprobs if delta else self.prompt_logprobs request_output = RequestOutput.new( self.request_id, @@ -163,6 +205,8 @@ def add_tokens( self.prompt_token_ids, output_text, token_ids, + logprobs, + prompt_logprobs, finished, ) @@ -254,6 +298,8 @@ def step( # Detokenize and update state. request_output = detokenizer.add_tokens( new_token_ids=engine_core_output.new_token_ids, + new_logprobs=engine_core_output.logprobs, + new_prompt_logprobs=engine_core_output.prompt_logprobs, finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bd19d998a4adb..b93634230529e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -134,8 +134,9 @@ def add_request( # 1) Process raw inputs into the request. detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + request_id, prompt, params, arrival_time, + self.get_model_config().max_logprobs, lora_request, trace_headers, + prompt_adapter_request, priority) # 2) Add the request to Detokenizer. self.detokenizer.add_request(detokenizer_req) @@ -158,11 +159,12 @@ def step(self) -> List[RequestOutput]: return request_outputs - # TODO(rob): Can we get rid of these? - def get_model_config(self): + """Gets the model configuration.""" return self.model_config + # TODO(rob): Can we get rid of these? 
+ def start_profile(self): self.engine_core.profile(True) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c1577190c75a..5bcf1b5e7b86e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -39,6 +39,28 @@ def __init__( self.input_processor = input_registry.create_input_processor( model_config) + def _assert_valid_logprobs_prompt_logprobs( + self, + params: Union[SamplingParams, PoolingParams], + max_logprobs: int, + ): + """Validate requested number of sample logprobs & prompt logprobs + + Fails with ValueError if to many logprobs are requested. + + Args: + params: Sampling parameters + max_logprobs: max number of logprobs or prompt logprobs + """ + + if isinstance(params, SamplingParams) and ( + (params.logprobs and params.logprobs > max_logprobs) or + (params.prompt_logprobs + and params.prompt_logprobs > max_logprobs)): + + raise ValueError(f"Cannot request more than " + f"{max_logprobs} logprobs or prompt logprobs.") + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the # asyncio loop while this is running. @@ -48,6 +70,7 @@ def process_inputs( prompt: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: float, + max_logprobs: int, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -55,9 +78,10 @@ def process_inputs( ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: # TODO(woosuk): Support embedding mode. - # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. + self._assert_valid_logprobs_prompt_logprobs(params, max_logprobs) + if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") @@ -106,6 +130,8 @@ def process_inputs( sampling_params.output_kind, sampling_params.stop, sampling_params.include_stop_str_in_output, + sampling_params.logprobs, + sampling_params.prompt_logprobs, ) # Make Request for EngineCore. diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 8574987728844..3cd0430aabd6f 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -15,8 +15,9 @@ class SamplerOutput: # [num_reqs, max_num_logprobs + 1] logprobs: Optional[torch.Tensor] - # TODO: Support prompt logprobs. 
+ # [num_prompt_tokens, max_num_prompt_logprobs + 1] prompt_logprob_token_ids: Optional[torch.Tensor] + # [num_prompt_tokens, max_num_prompt_logprobs + 1] prompt_logprobs: Optional[torch.Tensor] @@ -35,3 +36,8 @@ class ModelRunnerOutput: logprob_token_ids_cpu: Optional[torch.Tensor] # [num_reqs, max_num_logprobs + 1] logprobs_cpu: Optional[torch.Tensor] + + # [num_reqs, max_num_prompt_logprobs] + prompt_logprob_token_ids_cpu: Optional[torch.Tensor] + # [num_reqs, max_num_prompt_logprobs] + prompt_logprobs_cpu: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 51fb4003e5fe0..ce2accbd63aff 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -5,7 +5,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams -from vllm.sequence import RequestMetrics +from vllm.sequence import PromptLogprobs, RequestMetrics, SampleLogprobs from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList @@ -43,6 +43,12 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() + self.max_logprobs = sampling_params.logprobs + self.max_prompt_logprobs = sampling_params.prompt_logprobs + self.logprobs: Optional[SampleLogprobs] = ( + None if self.max_logprobs is None else []) + self.prompt_logprobs: Optional[PromptLogprobs] = ( + None if self.max_prompt_logprobs is None else []) self.num_computed_tokens = 0 # Raw multimodal data before the mm input mapper (e.g., PIL images). diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 9ef36f2e6b212..3bf5a462d5070 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict +from typing import Dict, Optional import torch @@ -19,3 +19,9 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] max_num_logprobs: int + max_num_prompt_logprobs: int + + num_query_tokens: Optional[torch.Tensor] = None + num_sampled_tokens: Optional[torch.Tensor] = None + maybe_sample_logits_indices: Optional[torch.Tensor] = None + prompt_logits_mask: Optional[torch.Tensor] = None diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 927f274541c4d..77424df30e9ca 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Dict +from typing import Dict, Optional, Tuple import torch import torch.nn as nn @@ -12,41 +12,150 @@ class Sampler(nn.Module): - def forward( + def _apply_temperature_top_k_top_p( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - logits = self.apply_temperature(logits, sampling_metadata.temperature) - logits = self.apply_top_k_top_p(logits, sampling_metadata) + num_query_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + + temperature = (sampling_metadata.temperature if + num_query_tokens is None else torch.repeat_interleave( + sampling_metadata.temperature, num_query_tokens)) + + return self._apply_top_k_top_p( + self._apply_temperature(logits, temperature), sampling_metadata) - probs = self.get_probs(logits) + def _probs_sample( + self, + maybe_sample_logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + probs = self.get_probs(maybe_sample_logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to 
reduce the tensor size. - sampled = sampled.to(torch.int32) - - if sampling_metadata.max_num_logprobs > 0: - logprobs = self.get_logprobs(logits) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) - # Use int32 to reduce the tensor size. - topk_indices = topk_indices.to(torch.int32) + return sampled.to(torch.int32) + + def _topk_logprobs_indices( + self, + logprobs: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + topk_logprobs, topk_indices = torch.topk( + logprobs, sampling_metadata.max_num_logprobs, dim=-1) + # Use int32 to reduce the tensor size. + return topk_logprobs, topk_indices.to(torch.int32) + + def forward( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + num_query_tokens = sampling_metadata.num_query_tokens + maybe_sample_logits_indices = ( + sampling_metadata.maybe_sample_logits_indices) + prompt_logits_mask = sampling_metadata.prompt_logits_mask + + if do_prompt_logprobs: + logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( + logits, sampling_metadata, num_query_tokens) + + maybe_sample_logits_w_tmp_tpk_tpp = ( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) else: - topk_logprobs = None - topk_indices = None + maybe_sample_logits_w_tmp_tpk_tpp = ( + self._apply_temperature_top_k_top_p( + logits[maybe_sample_logits_indices], sampling_metadata, + None)) + + maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, + sampling_metadata) + + if do_logprobs and do_prompt_logprobs: + logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) + + maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, + maybe_sampled] + + topk_logprobs, topk_indices = self._topk_logprobs_indices( + logprobs, sampling_metadata) + + maybe_sample_topk_logprobs = topk_logprobs[ + maybe_sample_logits_indices, :] + maybe_sample_topk_indices = topk_indices[ + maybe_sample_logits_indices, :] + prompt_topk_logprobs = topk_logprobs[prompt_logits_mask, :] + prompt_topk_indices = topk_indices[prompt_logits_mask, :] + + # Concat sampled token logprobs + maybe_sample_topk_logprobs = torch.cat( + (maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1) + #Concat sampled token id + maybe_sample_topk_indices = torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1) + elif do_logprobs: + logprobs = self.get_logprobs( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices, :]) + + maybe_sampled_logprobs = logprobs[ + torch.arange(maybe_sampled.shape[0]), maybe_sampled] + + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = self._topk_logprobs_indices(logprobs, sampling_metadata) + + # Concat sampled token logprobs + maybe_sample_topk_logprobs = torch.cat( + (maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1) + #Concat sampled token id + maybe_sample_topk_indices = torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1) + + ( + prompt_topk_logprobs, + prompt_topk_indices, + ) = (None, None) + + elif do_prompt_logprobs: + logprobs = self.get_logprobs( + logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) + + prompt_topk_logprobs, prompt_topk_indices = ( + self._topk_logprobs_indices(logprobs, sampling_metadata)) + 
+ ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = (None, None) + else: + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + prompt_topk_logprobs, + prompt_topk_indices, + ) = (None, None, None, None) sampler_output = SamplerOutput( - sampled_token_ids=sampled, - logprob_token_ids=topk_indices, - logprobs=topk_logprobs, - prompt_logprob_token_ids=None, - prompt_logprobs=None, - ) + sampled_token_ids=maybe_sampled, + logprob_token_ids=maybe_sample_topk_indices, + logprobs=maybe_sample_topk_logprobs, + prompt_logprob_token_ids=prompt_topk_indices, + prompt_logprobs=prompt_topk_logprobs) + return sampler_output - def apply_temperature( + def _apply_temperature( self, logits: torch.Tensor, temp: torch.Tensor, @@ -59,7 +168,7 @@ def apply_temperature( logits.div_(temp.unsqueeze(dim=1)) return logits - def apply_top_k_top_p( + def _apply_top_k_top_p( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 13cbc8fa39c03..0a3fb0535e35a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -207,7 +207,15 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): + def _prepare_inputs( + self, + scheduler_output: "SchedulerOutput", + sampling_metadata: SamplingMetadata, + num_input_tokens: int, + ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, + torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -240,8 +248,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] arange_matrix = np.tile(np.arange(max_num_scheduled_tokens), (num_reqs, 1)) - mask = arange_matrix < num_scheduled_tokens[:, np.newaxis] - arange = arange_matrix[mask] + prompt_logits_mask = arange_matrix < num_scheduled_tokens[:, + np.newaxis] + arange = arange_matrix[prompt_logits_mask] # Get positions. positions = torch.empty((total_num_scheduled_tokens, ), @@ -321,8 +330,27 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. # TODO: Support prompt logprobs. 
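+        # query_start_loc[1:] - 1 gives the index of each request's last
+        # scheduled token, the only position a new token may be sampled from.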
- logits_indices = query_start_loc[1:] - 1 - return input_ids, attn_metadata, logits_indices + maybe_sample_logits_indices = query_start_loc[1:] - 1 + num_query_tokens = torch.diff(query_start_loc) + num_sampled_tokens = torch.tensor( + scheduler_output.partial_running_reqs, device=self.device) + + # One or more requests require prompt logprobs + complete_req_mask = torch.tensor( + [not x for x in scheduler_output.partial_running_reqs]) + + if do_prompt_logprobs: + prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) + prompt_logits_mask[ + maybe_sample_logits_indices[complete_req_mask]] = False + + return (input_ids, attn_metadata, num_query_tokens, + num_sampled_tokens, maybe_sample_logits_indices, + prompt_logits_mask) + else: + # No requests require prompt logprobs + return (input_ids, attn_metadata, num_query_tokens, + num_sampled_tokens, maybe_sample_logits_indices, None) def _prepare_sampling( self, @@ -421,9 +449,8 @@ def execute_model( self._execute_encoder(scheduler_output) encoder_outputs = self._gather_encoder_outputs(scheduler_output) - # Prepare the decoder inputs. - input_ids, attn_metadata, logits_indices = self._prepare_inputs( - scheduler_output) + sampling_metadata = self._prepare_sampling(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -435,6 +462,21 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + + # Prepare the decoder inputs. + ( + input_ids, + attn_metadata, + num_query_tokens, + num_sampled_tokens, + maybe_sample_logits_indices, + prompt_logits_mask, + ) = self._prepare_inputs(scheduler_output=scheduler_output, + sampling_metadata=sampling_metadata, + num_input_tokens=num_input_tokens) + # Get the inputs embeds. if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( @@ -456,14 +498,18 @@ def execute_model( attn_metadata=None, inputs_embeds=self.inputs_embeds[:num_input_tokens], ) + hidden_states = hidden_states[:num_scheduled_tokens] - hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(hidden_states, None) + + sampling_metadata.num_query_tokens = num_query_tokens + sampling_metadata.num_sampled_tokens = num_sampled_tokens + sampling_metadata.maybe_sample_logits_indices = ( + maybe_sample_logits_indices) + sampling_metadata.prompt_logits_mask = prompt_logits_mask # Sample the next token and get logprobs if needed. 
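+        # sampling_metadata now carries the per-request token counts and
+        # logprob masks attached above, so the sampler can slice the logits.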
- sampling_metadata = self._prepare_sampling(scheduler_output) sampler_output = self.model.sample( - logits=logits, + logits=self.model.compute_logits(hidden_states, None), sampling_metadata=sampling_metadata, ) @@ -491,21 +537,27 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - if sampler_output.logprob_token_ids is None: - logprob_token_ids = None - else: - logprob_token_ids = sampler_output.logprob_token_ids.cpu() - if sampler_output.logprobs is None: - logprobs = None - else: - logprobs = sampler_output.logprobs.cpu() + ( + logprob_token_ids, + logprobs, + ) = ((sampler_output.logprob_token_ids.cpu(), + sampler_output.logprobs.cpu()) if do_logprobs else (None, None)) + + ( + prompt_logprob_token_ids, + prompt_logprobs, + ) = ((sampler_output.prompt_logprob_token_ids.cpu(), + sampler_output.prompt_logprobs.cpu()) if do_prompt_logprobs else + (None, None)) + model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, logprob_token_ids_cpu=logprob_token_ids, logprobs_cpu=logprobs, - ) + prompt_logprob_token_ids_cpu=prompt_logprob_token_ids, + prompt_logprobs_cpu=prompt_logprobs) return model_runner_output def load_model(self) -> None: @@ -692,6 +744,7 @@ def __init__( self.generators: Dict[int, torch.Generator] = {} self.num_logprobs: Dict[str, int] = {} + self.num_prompt_logprobs: Dict[str, int] = {} self.prompt_logprob_reqs: Set[str] = set() def add_request( @@ -737,8 +790,11 @@ def add_request( self.generators[req_index] = request.generator num_logprobs = sampling_params.logprobs + num_prompt_logprobs = sampling_params.prompt_logprobs if num_logprobs is not None and num_logprobs > 0: self.num_logprobs[req_id] = num_logprobs + if num_prompt_logprobs is not None and num_prompt_logprobs > 0: + self.num_prompt_logprobs[req_id] = num_prompt_logprobs if sampling_params.prompt_logprobs: self.prompt_logprob_reqs.add(req_id) @@ -754,6 +810,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.top_k_reqs.discard(req_id) self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) + self.num_prompt_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) return req_index @@ -766,6 +823,7 @@ def clear(self) -> None: self.top_k_reqs.clear() self.generators.clear() self.num_logprobs.clear() + self.num_prompt_logprobs.clear() self.prompt_logprob_reqs.clear() def condense(self, empty_req_indices: List[int]) -> None: @@ -832,7 +890,7 @@ def make_sampling_metadata( no_top_k=self.no_top_k, generators=self.generators, max_num_logprobs=self.max_num_logprobs, - ) + max_num_prompt_logprobs=self.max_num_prompt_logprobs) @property def num_reqs(self) -> int: @@ -858,6 +916,11 @@ def no_top_k(self) -> bool: def max_num_logprobs(self) -> int: return max(self.num_logprobs.values()) if self.num_logprobs else 0 + @property + def max_num_prompt_logprobs(self) -> int: + return (max(self.num_prompt_logprobs.values()) + if self.num_prompt_logprobs else 0) + @property def no_logprob(self) -> bool: return len(self.num_logprobs) == 0 From e39555101d769ba94719246b3cb020119c8cfbdf Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 07:58:04 -0500 Subject: [PATCH 049/293] fixed issue with sample-logprob-only batches Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 77424df30e9ca..26dd4bafcff44 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -101,8 +101,7 @@ def forward( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), dim=-1) elif do_logprobs: - logprobs = self.get_logprobs( - logits_w_tmp_tpk_tpp[maybe_sample_logits_indices, :]) + logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) maybe_sampled_logprobs = logprobs[ torch.arange(maybe_sampled.shape[0]), maybe_sampled] From ae66ae4308c7375414381a78063de16bb0ed0a53 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 08:23:27 -0500 Subject: [PATCH 050/293] refactored logprobs tensor pythonization in scheduler Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 2 - vllm/outputs.py | 13 +++++- vllm/v1/core/scheduler.py | 68 +++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 114ce7bd1f2fb..29e193e28092f 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -33,14 +33,12 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: no logprobs Args: - batch_logprobs_composition: types of logprobs configs to include in batch Returns: List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) tuples - """ if batch_logprobs_composition == "NONE": # No requests with sample or prompt logprobs diff --git a/vllm/outputs.py b/vllm/outputs.py index 9733158504945..912e485e40b59 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -131,7 +131,18 @@ def new( prompt_logprobs: Optional[PromptLogprobs], finished: bool = False, ) -> "RequestOutput": - """Initialize a new RequestOutput object.""" + """Initialize a new RequestOutput object. + + Args: + request_id + prompt: optional single prompt string + prompt_token_ids: optional list of prompt tokens + text: completion text + token_ids: completion token ids + logprobs: completion sample logprobs + prompt_logprobs: prompt logprobs + finished + """ # TODO: Support `n` > 1. completion_output = CompletionOutput(index=0, diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 476b12c705482..0e09da028b16f 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -385,29 +385,77 @@ def _try_schedule_encoder_inputs( encoder_inputs_to_schedule.append(i) return encoder_inputs_to_schedule, num_new_tokens, encoder_budget - def update_from_output( + def _pythonize_logprobs( self, - scheduler_output: "SchedulerOutput", + do_logprobs: bool, + do_prompt_logprobs: bool, model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: - # NOTE(woosuk): This method doesn't consider speculative decoding. - sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() - num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_logprobs = model_runner_output.logprobs_cpu is not None - do_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu is not None - and len(model_runner_output.prompt_logprobs_cpu) > 0) + ) -> Tuple[List, List, List, List]: + """Convert logprobs tensors to Python data structures. 
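+        All tensors consumed here are the CPU-side ``*_cpu`` fields of the
+        model runner output.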
+ + Args: + do_logprobs: sample logprobs are required + do_prompt_logprobs: prompt logprobs are required + model_runner_output: model runner output contains CPU logprobs tensors + + Returns: + logprob_token_ids_list + logprob_values_list + prompt_logprob_token_ids_list + prompt_logprob_values_list + """ if do_logprobs: + # Pythonize sample logprobs if needed assert model_runner_output.logprob_token_ids_cpu is not None logprob_token_ids_list = ( model_runner_output.logprob_token_ids_cpu.tolist()) logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + else: + ( + logprob_token_ids_list, + logprob_values_list, + ) = (None, None) if do_prompt_logprobs: + # Pythonize prompt logprobs if needed assert model_runner_output.prompt_logprob_token_ids_cpu is not None prompt_logprob_token_ids_list = ( model_runner_output.prompt_logprob_token_ids_cpu.tolist()) prompt_logprob_values_list = ( model_runner_output.prompt_logprobs_cpu.tolist()) + else: + ( + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = (None, None) + + return (logprob_token_ids_list, logprob_values_list, + prompt_logprob_token_ids_list, prompt_logprob_values_list) + + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[EngineCoreOutput]: + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + + # Get logprobs as Python data structures + ( + logprob_token_ids_list, + logprob_values_list, + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, + model_runner_output) + + if do_prompt_logprobs: + # Index into prompt tokens, for building + # prompt logprobs output data structure curr_prompt_base_idx = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] From 17d858d5ffe0a63b5968196d791180f24e5484a5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 08:42:57 -0500 Subject: [PATCH 051/293] added fast logprobs test Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 131 +++++++++++++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 26 ++---- 2 files changed, 104 insertions(+), 53 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 29e193e28092f..86d34a8285a86 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -75,50 +75,17 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: raise ValueError("Invalid logprobs batch configuration for test.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) -@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) -@pytest.mark.parametrize("batch_logprobs_composition", - ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) -def test_get_logprobs_and_prompt_logprobs( +def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, model: str, dtype: str, - # detokenize: bool, + detokenize: bool, batch_logprobs_composition: str, 
max_num_batched_tokens: int, example_prompts, monkeypatch, -): - """Test V1 Engine logprobs & prompt logprobs - - Exercise a variety of combinations of `logprobs` and `prompt_logprobs` - settings and validate that - * The generated logprobs and prompt logprobs are consistent with the - configuration settings, in terms of whether or not the logprobs - (of either type) were requested and how many were requested - * The generated logprobs are consistent with the generated tokens - * The generated (prompt)logprobs are consistent with HuggingFace - (prompt)logprobs, as a reference - - batch_logprobs_composition controls the logprobs configurations for - requests in the batch under test. - - Args: - hf_runner - vllm_runner - model - dtype - detokenize: if False, return generated tokens bypassing detokenizer - batch_logprobs_composition: logprobs configuration for test batch - example_prompts - monkeypatch - """ - detokenize = True - +) -> None: test_prompts = example_prompts # LLM engine v1 @@ -273,6 +240,98 @@ def test_get_logprobs_and_prompt_logprobs( assert vllm_result.prompt_logprobs is None +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +def test_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +) -> None: + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. 
+ + Args: + hf_runner + vllm_runner + model + dtype + detokenize: if False, return generated tokens bypassing detokenizer + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + detokenize = True + + _test_case_get_logprobs_and_prompt_logprobs( + hf_runner=hf_runner, + vllm_runner=vllm_runner, + model=model, + dtype=dtype, + detokenize=detokenize, + batch_logprobs_composition=batch_logprobs_composition, + max_num_batched_tokens=max_num_batched_tokens, + example_prompts=example_prompts, + monkeypatch=monkeypatch) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", + ["half"]) # needed for comparing logprobs with HF +# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("max_num_batched_tokens", [128]) +@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +def test_fast_get_logprobs_and_prompt_logprobs( + hf_runner, + vllm_runner, + model: str, + dtype: str, + # detokenize: bool, + batch_logprobs_composition: str, + max_num_batched_tokens: int, + example_prompts, + monkeypatch, +) -> None: + """Fast test: V1 Engine logprobs & prompt logprobs + + Faster version of `test_get_logprobs_and_prompt_logprobs` with + fewer test cases. + """ + detokenize = True + + _test_case_get_logprobs_and_prompt_logprobs( + hf_runner=hf_runner, + vllm_runner=vllm_runner, + model=model, + dtype=dtype, + detokenize=detokenize, + batch_logprobs_composition=batch_logprobs_composition, + max_num_batched_tokens=max_num_batched_tokens, + example_prompts=example_prompts, + monkeypatch=monkeypatch) + + def test_max_logprobs(monkeypatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a3fb0535e35a..96bf7763e98b3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -537,27 +537,19 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - ( - logprob_token_ids, - logprobs, - ) = ((sampler_output.logprob_token_ids.cpu(), - sampler_output.logprobs.cpu()) if do_logprobs else (None, None)) - - ( - prompt_logprob_token_ids, - prompt_logprobs, - ) = ((sampler_output.prompt_logprob_token_ids.cpu(), - sampler_output.prompt_logprobs.cpu()) if do_prompt_logprobs else - (None, None)) - model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, - logprob_token_ids_cpu=logprob_token_ids, - logprobs_cpu=logprobs, - prompt_logprob_token_ids_cpu=prompt_logprob_token_ids, - prompt_logprobs_cpu=prompt_logprobs) + logprob_token_ids_cpu=(sampler_output.logprob_token_ids.cpu() + if do_logprobs else None), + logprobs_cpu=(sampler_output.logprobs.cpu() + if do_logprobs else None), + prompt_logprob_token_ids_cpu=( + sampler_output.prompt_logprob_token_ids.cpu() + if do_prompt_logprobs else None), + prompt_logprobs_cpu=(sampler_output.prompt_logprobs.cpu() + if do_prompt_logprobs else None)) return model_runner_output def load_model(self) -> None: From f5c0afd27b05cc37515fdb363c91504404f492f8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 10:11:35 -0500 Subject: [PATCH 052/293] wip refactor Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- vllm/v1/outputs.py | 8 +- vllm/v1/sample/metadata.py | 1 - vllm/v1/sample/sampler.py | 260 
+++++++++++++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 9 +- 4 files changed, 195 insertions(+), 83 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 3cd0430aabd6f..0bbbf24abd76d 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -11,14 +11,14 @@ class SamplerOutput: sampled_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: Optional[torch.Tensor] + logprob_token_ids: Optional[torch.Tensor] = None # [num_reqs, max_num_logprobs + 1] - logprobs: Optional[torch.Tensor] + logprobs: Optional[torch.Tensor] = None # [num_prompt_tokens, max_num_prompt_logprobs + 1] - prompt_logprob_token_ids: Optional[torch.Tensor] + prompt_logprobs: Optional[torch.Tensor] = None # [num_prompt_tokens, max_num_prompt_logprobs + 1] - prompt_logprobs: Optional[torch.Tensor] + prompt_logprob_token_ids: Optional[torch.Tensor] = None @dataclass diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 3bf5a462d5070..51fdae841971b 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -22,6 +22,5 @@ class SamplingMetadata: max_num_prompt_logprobs: int num_query_tokens: Optional[torch.Tensor] = None - num_sampled_tokens: Optional[torch.Tensor] = None maybe_sample_logits_indices: Optional[torch.Tensor] = None prompt_logits_mask: Optional[torch.Tensor] = None diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 26dd4bafcff44..32abeca59e532 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -47,112 +47,230 @@ def _topk_logprobs_indices( # Use int32 to reduce the tensor size. return topk_logprobs, topk_indices.to(torch.int32) - def forward( + def _compute_logprobs_from_processed_logits( self, - logits: torch.Tensor, + do_logprobs: bool, + do_prompt_logprobs: bool, + maybe_sampled: torch.Tensor, + maybe_sample_logits_indices: Optional[torch.Tensor], + prompt_logits_mask: Optional[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - num_query_tokens = sampling_metadata.num_query_tokens - maybe_sample_logits_indices = ( - sampling_metadata.maybe_sample_logits_indices) - prompt_logits_mask = sampling_metadata.prompt_logits_mask - - if do_prompt_logprobs: - logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( - logits, sampling_metadata, num_query_tokens) - - maybe_sample_logits_w_tmp_tpk_tpp = ( - logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) - else: - maybe_sample_logits_w_tmp_tpk_tpp = ( - self._apply_temperature_top_k_top_p( - logits[maybe_sample_logits_indices], sampling_metadata, - None)) - - maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, - sampling_metadata) - + maybe_sample_logits_w_tmp_tpk_tpp: torch.Tensor, + logits_w_tmp_tpk_tpp: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute sample and prompt logprobs as required by batch config + + Consumes logits which have already had temperature, top-k and top-p + applied. + + `do_logprobs` and `do_prompt_logprobs` control whether sample and + prompt logprobs are computed, respectively. + + This function does not handle the case where no logprobs are required + at the batch level; it is assumed this function will not be called in + that scenario. 
+ + Args: + do_logprobs: compute sample logprobs + do_prompt_logprobs: compute prompt logprobs + maybe_sampled: list of sampled tokens; if there is a partial request, + includes the partial request's sampled token (which + will later be discarded.) + maybe_sample_logits_indices: sequence-offset indices where a new + token is decoded; if there is a partial request, + includes the index of the partial request's sampled + token (which will later be discarded.) + prompt_logits_mask: mask indicating the sequence offsets of prompt + tokens. Note: if there is a partial request, + this mask includes the index of the partial request's + sample token (since this sampled token will be + discarded, but the logprobs computed at this offset + are part of the prompt logprobs.) Note that this means + prompt_logits_mask and maybe_sample_logits_indices + may have overlap. + sampling_metadata + maybe_sample_logits_w_tmp_tpk_tpp: assumed to be logits gathered + from sequence offsets where a new token is being + decoded (including for a partial request); assumed + that temperature, top-k and top-p have been applied. + logits_w_tmp_tpk_tpp: optional; all logits with temperature, top-k, + top-p applied. + + Returns: + Sample logprobs (`None` if `do_logprobs == False`) + Sample logprobs token indices (`None` if `do_logprobs == False`) + Prompt logprobs (`None` if `do_prompt_logprobs == False`) + Prompt logprobs token indices + (`None` if `do_prompt_logprobs == False`) + """ + + assert do_logprobs or do_prompt_logprobs if do_logprobs and do_prompt_logprobs: - logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) - - maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, - maybe_sampled] + # Batch requires sample and prompt logprobs + # - Compute top logprobs for all sequence offsets + logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) topk_logprobs, topk_indices = self._topk_logprobs_indices( logprobs, sampling_metadata) + # - Gather logprobs for sequence offsets where new tokens are + # decoded maybe_sample_topk_logprobs = topk_logprobs[ maybe_sample_logits_indices, :] maybe_sample_topk_indices = topk_indices[ maybe_sample_logits_indices, :] - prompt_topk_logprobs = topk_logprobs[prompt_logits_mask, :] - prompt_topk_indices = topk_indices[prompt_logits_mask, :] - # Concat sampled token logprobs - maybe_sample_topk_logprobs = torch.cat( - (maybe_sample_topk_logprobs, - maybe_sampled_logprobs.unsqueeze(-1)), - dim=-1) - #Concat sampled token id - maybe_sample_topk_indices = torch.cat( - (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), - dim=-1) - elif do_logprobs: - logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) + # - In case sampled tokens are not in the top logprobs at their + # respective sequence offsets, gather logprobs associated with + # sampled tokens + maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, + maybe_sampled] - maybe_sampled_logprobs = logprobs[ - torch.arange(maybe_sampled.shape[0]), maybe_sampled] + return ( + # Sample logprobs (including sampled tokens) + torch.cat((maybe_sample_topk_logprobs, + maybe_sampled_logprobs.unsqueeze(-1)), + dim=-1), + # Sample logprobs token indices (including sampled tokens) + torch.cat( + (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), + dim=-1), + # Prompt logprobs + topk_logprobs[prompt_logits_mask, :], + # Prompt logprob token indices + topk_indices[prompt_logits_mask, :]) + elif do_logprobs: + # Batch requires only sample logprobs + # - Compute top logprobs only at sequence offsets where new tokens + # are 
being decoded + logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) ( maybe_sample_topk_logprobs, maybe_sample_topk_indices, ) = self._topk_logprobs_indices(logprobs, sampling_metadata) - # Concat sampled token logprobs + # - In case sampled tokens are not in the top logprobs at their + # respective sequence offsets, gather logprobs associated with + # sampled tokens + maybe_sampled_logprobs = logprobs[ + torch.arange(maybe_sampled.shape[0]), maybe_sampled] + + # - Concat sampled token logprobs maybe_sample_topk_logprobs = torch.cat( (maybe_sample_topk_logprobs, maybe_sampled_logprobs.unsqueeze(-1)), dim=-1) - #Concat sampled token id + # - Concat sampled token id maybe_sample_topk_indices = torch.cat( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), dim=-1) - ( - prompt_topk_logprobs, - prompt_topk_indices, - ) = (None, None) + # Return sample logprobs + return (maybe_sample_topk_logprobs, maybe_sample_topk_indices, + None, None) elif do_prompt_logprobs: + # Batch requires only prompt logprobs + + # - Compute top logprobs only at sequence offsets of prompt tokens logprobs = self.get_logprobs( logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) - prompt_topk_logprobs, prompt_topk_indices = ( - self._topk_logprobs_indices(logprobs, sampling_metadata)) + # Return prompt logprobs + return ((None, None) + + self._topk_logprobs_indices(logprobs, sampling_metadata)) - ( - maybe_sample_topk_logprobs, - maybe_sample_topk_indices, - ) = (None, None) + def forward( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + """Implement sampling. + + Apply temperature, top-k and top-p. + Sample from the probability distribution implied by `logits`. + Only sample at sequence offsets where new tokens are decoded. + In the process, compute sample and prompt logprobs (if required.) + + Args: + logits: model output logits which imply probability distribution. + sampling_metadata: sampling config settings + + Returns: + Sampler output. Sampled tokens and sample/prompt logprobs + (if requested) + """ + + # Batch-level logprobs configs. `do_logprobs` indicates whether + # any request requires sample logprobs. `do_prompt_logprobs` + # indicates whether any request requires prompt logprobs. + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + do_any_logprobs = do_logprobs or do_prompt_logprobs + + num_query_tokens = sampling_metadata.num_query_tokens + maybe_sample_logits_indices = ( + sampling_metadata.maybe_sample_logits_indices) + prompt_logits_mask = sampling_metadata.prompt_logits_mask + + # Apply temperature, top-k and top-p to logits at sequence offsets + # where a new token is being decoded. + if do_prompt_logprobs: + # If prompt logprobs are required, then temp/top-k/top-p + # must also be applied to prompt logits as a prerequisite. + # So pass *all* logits through temp/top-k/top-p, then gather + # the processed logits from the sequence offsets where a new token + # is being decoded. + logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( + logits, sampling_metadata, num_query_tokens) + + maybe_sample_logits_w_tmp_tpk_tpp = ( + logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) else: + # If prompt logprobs are not required, then gather the logits + # only from the sequence offsets where a new token is being + # decoded, and *only* apply temp/top-k/top-p to those logits. 
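# A minimal, self-contained sketch of the "top-k logprobs plus the sampled
# token" pattern described in the comments above, using made-up shapes and
# plain torch ops only; the variable names here are illustrative and are not
# taken from this file.
import torch

num_samples, vocab_size, num_logprobs = 4, 32, 3
logprobs = torch.log_softmax(torch.randn(num_samples, vocab_size), dim=-1)
sampled = torch.randint(vocab_size, (num_samples, ))

topk_logprobs, topk_token_ids = torch.topk(logprobs, num_logprobs, dim=-1)
# The sampled token may fall outside the top-k at its offset, so its logprob
# is gathered separately and appended as one extra column.
sampled_logprobs = logprobs[torch.arange(num_samples), sampled]
out_logprobs = torch.cat((topk_logprobs, sampled_logprobs.unsqueeze(-1)),
                         dim=-1)
out_token_ids = torch.cat((topk_token_ids, sampled.unsqueeze(-1)), dim=-1)
assert out_logprobs.shape == (num_samples, num_logprobs + 1)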
+ maybe_sample_logits_w_tmp_tpk_tpp = ( + self._apply_temperature_top_k_top_p( + logits[maybe_sample_logits_indices], sampling_metadata, + None)) + + # Compute and sample token probability distribution, *only* at sequence + # offsets where a new token is being decoded + maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, + sampling_metadata) + + # Compute sample & prompt logprobs, as-needed + if do_any_logprobs: ( - maybe_sample_topk_logprobs, - maybe_sample_topk_indices, - prompt_topk_logprobs, - prompt_topk_indices, - ) = (None, None, None, None) - - sampler_output = SamplerOutput( - sampled_token_ids=maybe_sampled, - logprob_token_ids=maybe_sample_topk_indices, - logprobs=maybe_sample_topk_logprobs, - prompt_logprob_token_ids=prompt_topk_indices, - prompt_logprobs=prompt_topk_logprobs) - - return sampler_output + maybe_sample_logprobs, + maybe_sample_logprobs_token_indices, + prompt_logprobs, + prompt_logprobs_token_indices, + ) = self._compute_logprobs_from_processed_logits( + do_logprobs=do_logprobs, + do_prompt_logprobs=do_prompt_logprobs, + maybe_sampled=maybe_sampled, + maybe_sample_logits_indices=maybe_sample_logits_indices, + prompt_logits_mask=prompt_logits_mask, + sampling_metadata=sampling_metadata, + maybe_sample_logits_w_tmp_tpk_tpp= + maybe_sample_logits_w_tmp_tpk_tpp, + logits_w_tmp_tpk_tpp=(logits_w_tmp_tpk_tpp + if do_prompt_logprobs else None)) + + # Return decoded output tokens and sample/prompt logprobs, + # as required + return SamplerOutput( + sampled_token_ids=maybe_sampled, + logprobs=maybe_sample_logprobs, + logprob_token_ids=maybe_sample_logprobs_token_indices, + prompt_logprobs=prompt_logprobs, + prompt_logprob_token_ids=prompt_logprobs_token_indices) + else: + # No logprobs; return decoded output tokens + return SamplerOutput(sampled_token_ids=maybe_sampled) def _apply_temperature( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 96bf7763e98b3..dd0d1824246d4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -332,8 +332,6 @@ def _prepare_inputs( # TODO: Support prompt logprobs. 
maybe_sample_logits_indices = query_start_loc[1:] - 1 num_query_tokens = torch.diff(query_start_loc) - num_sampled_tokens = torch.tensor( - scheduler_output.partial_running_reqs, device=self.device) # One or more requests require prompt logprobs complete_req_mask = torch.tensor( @@ -345,12 +343,11 @@ def _prepare_inputs( maybe_sample_logits_indices[complete_req_mask]] = False return (input_ids, attn_metadata, num_query_tokens, - num_sampled_tokens, maybe_sample_logits_indices, - prompt_logits_mask) + maybe_sample_logits_indices, prompt_logits_mask) else: # No requests require prompt logprobs return (input_ids, attn_metadata, num_query_tokens, - num_sampled_tokens, maybe_sample_logits_indices, None) + maybe_sample_logits_indices, None) def _prepare_sampling( self, @@ -470,7 +467,6 @@ def execute_model( input_ids, attn_metadata, num_query_tokens, - num_sampled_tokens, maybe_sample_logits_indices, prompt_logits_mask, ) = self._prepare_inputs(scheduler_output=scheduler_output, @@ -502,7 +498,6 @@ def execute_model( hidden_states = hidden_states[:num_scheduled_tokens] sampling_metadata.num_query_tokens = num_query_tokens - sampling_metadata.num_sampled_tokens = num_sampled_tokens sampling_metadata.maybe_sample_logits_indices = ( maybe_sample_logits_indices) sampling_metadata.prompt_logits_mask = prompt_logits_mask From f7833f3af3f9feed8df7c95453c10ec89175be7e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 11:11:37 -0500 Subject: [PATCH 053/293] format Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 7 +-- vllm/v1/sample/sampler.py | 72 +++++++++++++++++++----------- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 86d34a8285a86..a303438c8a3d9 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -57,7 +57,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (None, 0), (0, None), (0, 0), - (None, 6), + (None, 7), (0, 5), ] elif batch_logprobs_composition == "SAMPLE_PROMPT": @@ -67,7 +67,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (0, 0), (5, None), (3, 0), - (6, 3), + (7, 3), (None, 6), (0, 5), ] @@ -301,7 +301,8 @@ def test_get_logprobs_and_prompt_logprobs( ["half"]) # needed for comparing logprobs with HF # @pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) def test_fast_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 32abeca59e532..4a0a3afb35e0b 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -36,14 +36,26 @@ def _probs_sample( # Use int32 to reduce the tensor size. 
return sampled.to(torch.int32) - def _topk_logprobs_indices( + def _top_logprobs_token_indices( self, logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, + max_num_logprobs: int, ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute top logprobs and associated token indices + + Args: + logprobs: total_tokens x vocab tensor + max_num_logprobs: Max number of top {sample,prompt} logprobs + requested in batch (depending on whether top sample + logprobs or top prompt logprobs are being computed) - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) + Returns: + Top logprobs, total_tokens x max_num_logprobs tensor + Top logprob token indices, total_tokens x max_num_logprobs tensor + """ + topk_logprobs, topk_indices = torch.topk(logprobs, + max_num_logprobs, + dim=-1) # Use int32 to reduce the tensor size. return topk_logprobs, topk_indices.to(torch.int32) @@ -97,28 +109,33 @@ def _compute_logprobs_from_processed_logits( top-p applied. Returns: - Sample logprobs (`None` if `do_logprobs == False`) - Sample logprobs token indices (`None` if `do_logprobs == False`) - Prompt logprobs (`None` if `do_prompt_logprobs == False`) - Prompt logprobs token indices - (`None` if `do_prompt_logprobs == False`) + Sample logprobs (`None` if `do_logprobs == False`, + o/w num_samples x max_num_logprobs tensor) + Sample logprobs token indices (`None` if `do_logprobs == False`, + o/w num_samples x max_num_logprobs tensor) + Prompt logprobs (`None` if `do_prompt_logprobs == False`, + o/w num_prompt_tokens x max_num_prompt_logprobs + tensor) + Prompt logprobs token indices (`None` if + `do_prompt_logprobs == False`, o/w + num_prompt_tokens x max_num_prompt_logprobs tensor) """ assert do_logprobs or do_prompt_logprobs if do_logprobs and do_prompt_logprobs: # Batch requires sample and prompt logprobs - # - Compute top logprobs for all sequence offsets + # - Compute logprobs for all sequence offsets logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) - topk_logprobs, topk_indices = self._topk_logprobs_indices( - logprobs, sampling_metadata) - # - Gather logprobs for sequence offsets where new tokens are - # decoded - maybe_sample_topk_logprobs = topk_logprobs[ - maybe_sample_logits_indices, :] - maybe_sample_topk_indices = topk_indices[ - maybe_sample_logits_indices, :] + # - Compute *top* logprobs for sequence offsets + # where a new token is being decoded + ( + maybe_sample_topk_logprobs, + maybe_sample_topk_indices, + ) = self._top_logprobs_token_indices( + logprobs[maybe_sample_logits_indices, :], + sampling_metadata.max_num_logprobs) # - In case sampled tokens are not in the top logprobs at their # respective sequence offsets, gather logprobs associated with @@ -126,7 +143,7 @@ def _compute_logprobs_from_processed_logits( maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, maybe_sampled] - return ( + return (( # Sample logprobs (including sampled tokens) torch.cat((maybe_sample_topk_logprobs, maybe_sampled_logprobs.unsqueeze(-1)), @@ -134,11 +151,11 @@ def _compute_logprobs_from_processed_logits( # Sample logprobs token indices (including sampled tokens) torch.cat( (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), - dim=-1), - # Prompt logprobs - topk_logprobs[prompt_logits_mask, :], - # Prompt logprob token indices - topk_indices[prompt_logits_mask, :]) + dim=-1)) + + # Prompt logprobs and token indices + self._top_logprobs_token_indices( + logprobs[prompt_logits_mask, :], + sampling_metadata.max_num_prompt_logprobs)) elif do_logprobs: # 
Batch requires only sample logprobs @@ -148,7 +165,8 @@ def _compute_logprobs_from_processed_logits( ( maybe_sample_topk_logprobs, maybe_sample_topk_indices, - ) = self._topk_logprobs_indices(logprobs, sampling_metadata) + ) = self._top_logprobs_token_indices( + logprobs, sampling_metadata.max_num_logprobs) # - In case sampled tokens are not in the top logprobs at their # respective sequence offsets, gather logprobs associated with @@ -178,8 +196,8 @@ def _compute_logprobs_from_processed_logits( logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) # Return prompt logprobs - return ((None, None) + - self._topk_logprobs_indices(logprobs, sampling_metadata)) + return ((None, None) + self._top_logprobs_token_indices( + logprobs, sampling_metadata.max_num_prompt_logprobs)) def forward( self, From 704d63562c34b5e7861c897da8aebd65ac40ba2a Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:11:16 -0800 Subject: [PATCH 054/293] [Bugfix] Fix for Spec model TP + Chunked Prefill (#10232) Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com> Signed-off-by: Sourashis Roy Co-authored-by: Sourashis Roy Signed-off-by: Andrew Feldman --- docs/source/serving/compatibility_matrix.rst | 2 +- tests/core/test_chunked_prefill_scheduler.py | 39 +++++++++++++ tests/spec_decode/e2e/test_compatibility.py | 46 --------------- .../e2e/test_integration_dist_tp2.py | 57 +++++++++++++++++++ tests/spec_decode/test_spec_decode_worker.py | 3 +- vllm/config.py | 10 ---- vllm/core/scheduler.py | 28 ++++++--- vllm/spec_decode/spec_decode_worker.py | 33 +++++++++-- 8 files changed, 145 insertions(+), 73 deletions(-) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index fa03d2cde1486..a93632ff36fb8 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -118,7 +118,7 @@ Feature x Feature - - * - :ref:`SD ` - - ✗ + - ✅ - ✅ - ✗ - ✅ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index acd82065ae457..eaaf004df38b2 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -413,6 +413,45 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens +@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) +def test_chunked_prefill_spec_prefill(num_scheduler_steps): + """Verify that the num_lookahead_slots is set appropriately for an all""" + """prefill batch depending on whether multi-step scheduling is enabled""" + """or not""" + block_size = 4 + max_seqs = 30 + max_model_len = 200 + max_num_batched_tokens = 30 + num_lookahead_slots = 4 + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + max_seqs, + max_model_len, + enable_chunked_prefill=True, + num_lookahead_slots=num_lookahead_slots, + num_scheduler_steps=num_scheduler_steps, + ) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 16 + cache_config.num_gpu_blocks = 16 + scheduler = Scheduler(scheduler_config, cache_config, None) + + _, seq_group = create_dummy_prompt("1", + prompt_length=30, + block_size=block_size) + scheduler.add_seq_group(seq_group) + _, out = schedule_and_update_computed_tokens(scheduler) + # The request is chunked. + # prefill scheduled now. 
+ assert len(out.scheduled_seq_groups) == 1 + assert out.num_prefill_groups == 1 + assert out.num_batched_tokens == max_num_batched_tokens + print(out.num_lookahead_slots) + assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else + num_lookahead_slots) + + def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index a3f0464e79675..af8397c235f48 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -50,49 +50,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): with pytest.raises(ValueError, match="cannot be larger than"): get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) - - -@pytest.mark.parametrize("common_llm_kwargs", - [{ - "model": "meta-llama/Llama-2-7b-chat-hf", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": "True", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "tensor_parallel_size": 2, - "speculative_draft_tensor_parallel_size": 2, - }, - { - "tensor_parallel_size": 4, - "speculative_draft_tensor_parallel_size": 4, - }, - { - "tensor_parallel_size": 8, - "speculative_draft_tensor_parallel_size": 8, - }, -]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_chunked_prefill_draft_model_tp_not_one( - test_llm_generator): - """Verify that speculative decoding fails if chunked prefill is enabled for - draft model with tensor parallelism of more than 1. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, match="with tensor parallel size 1"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 25562ca85adf4..02cba92795142 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -115,3 +115,60 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, max_output_len=32, seed=seed, temperature=0.0) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [[ + # Skip cuda graph recording for fast test. 
+ "--enforce-eager", + "--tensor_parallel_size", + "2", + + # precision + "--dtype", + "bfloat16", + ]]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [["--enable-chunked-prefill", "False"], + [ + "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", + "--max-num-seqs", "4" + ]]) +@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) +@pytest.mark.parametrize("model, test_llm_kwargs", + [("JackFram/llama-68m", [ + "--speculative-model", + "JackFram/llama-68m", + "--num_speculative-tokens", + "3", + ]), + ("JackFram/llama-68m", [ + "--speculative-model", + "JackFram/llama-68m", + "--num_speculative-tokens", + "3", + "--speculative-draft-tensor-parallel-size", + "1", + ])]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, seed: int): + """Verify spec decode works well with same and different TP size for + the draft model with chunked prefill. + """ + run_equality_correctness_test_tp(model, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=32, + seed=seed, + temperature=0.0) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 8df143104c279..d7caf57147278 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -867,7 +867,8 @@ def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): target_group_metadata_list = prefill + decodes execute_model_req = ExecuteModelRequest( seq_group_metadata_list=target_group_metadata_list, - num_lookahead_slots=k) + # For prefill only batches we expect num_lookahead_slots = 0. + num_lookahead_slots=k if n_decodes > 0 else 0) target_token_ids = torch.randint(low=0, high=vocab_size, diff --git a/vllm/config.py b/vllm/config.py index c87feaec3e5f6..eae6f909e3933 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1409,16 +1409,6 @@ def maybe_create_spec_config( draft_hf_config ) - if (enable_chunked_prefill and \ - speculative_draft_tensor_parallel_size != 1): - # TODO - Investigate why the error reported in - # https://github.com/vllm-project/vllm/pull/9291#issuecomment-2463266258 - # is happening and re-enable it. - raise ValueError( - "Chunked prefill and speculative decoding can be enabled " - "simultaneously only for draft models with tensor " - "parallel size 1.") - draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 530cbdc3a9190..d23009dae01ee 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1201,15 +1201,25 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) # Put prefills first due to Attention backend ordering assumption. 
+ scheduled_seq_groups = (prefills.seq_groups + + running_scheduled.prefill_seq_groups + + swapped_in.prefill_seq_groups + + running_scheduled.decode_seq_groups + + swapped_in.decode_seq_groups) + num_prefill_groups = (len(prefills.seq_groups) + + len(swapped_in.prefill_seq_groups) + + len(running_scheduled.prefill_seq_groups)) + # If all prompts, then we set num_lookahead_slots to 0 + # this allows us to go through the `no_spec` path in + # `spec_decode_worker.py` + all_prefills = (len(scheduled_seq_groups) == num_prefill_groups) + num_lookahead_slots = (0 if + (all_prefills + and not self.scheduler_config.is_multi_step) + else running_scheduled.num_lookahead_slots) return SchedulerOutputs( - scheduled_seq_groups=(prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups), - num_prefill_groups=(len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)), + scheduled_seq_groups=scheduled_seq_groups, + num_prefill_groups=num_prefill_groups, num_batched_tokens=budget.num_batched_tokens + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, @@ -1218,7 +1228,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, + num_lookahead_slots=num_lookahead_slots, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b57742c2ebfdd..b279931ca4b02 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -408,7 +408,20 @@ def execute_model( disable_all_speculation = self._should_disable_all_speculation( execute_model_req) num_lookahead_slots = execute_model_req.num_lookahead_slots - + all_prompt = True + atleast_one_prompt = False + all_zero_spec_tokens = True + for sgm in execute_model_req.seq_group_metadata_list: + all_prompt = all_prompt and sgm.is_prompt + atleast_one_prompt = atleast_one_prompt or sgm.is_prompt + all_zero_spec_tokens = all_zero_spec_tokens and ( + sgm.num_speculative_tokens == 0) + + if all_prompt and execute_model_req.seq_group_metadata_list: + assert num_lookahead_slots == 0, ( + "Prompt only runs should have num_lookahead_slots equal to 0. " + "This should never happen, please file a bug at " + "https://github.com/vllm-project/vllm/issues") # Speculative decoding is disabled in the following cases: # 1. Prefill phase: Speculative decoding is not # used during the prefill phase. @@ -419,11 +432,8 @@ def execute_model( # In any of these cases, the proposer and scorer workers # are called normally. # We expect `num_speculative_tokens` to be None for prefills. - no_spec = all( - sgm.is_prompt for sgm in execute_model_req.seq_group_metadata_list - ) or num_lookahead_slots == 0 or disable_all_speculation or all( - sgm.num_speculative_tokens == 0 - for sgm in execute_model_req.seq_group_metadata_list) + no_spec = (num_lookahead_slots == 0 or disable_all_speculation + or all_zero_spec_tokens) # Broadcast how many lookahead slots are scheduled for this step, and # whether all speculation is disabled, to all non-driver workers. 
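# A minimal sketch of the batch-level flags computed in the loop above and of
# how `no_spec` follows from them; `SeqGroupMeta`, `batch_flags` and
# `decide_no_spec` are hypothetical stand-ins used only for illustration.
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class SeqGroupMeta:
    is_prompt: bool
    num_speculative_tokens: int


def batch_flags(groups: List[SeqGroupMeta]) -> Tuple[bool, bool, bool]:
    all_prompt = all(g.is_prompt for g in groups)
    atleast_one_prompt = any(g.is_prompt for g in groups)
    all_zero_spec_tokens = all(g.num_speculative_tokens == 0 for g in groups)
    return all_prompt, atleast_one_prompt, all_zero_spec_tokens


def decide_no_spec(groups: List[SeqGroupMeta], num_lookahead_slots: int,
                   disable_all_speculation: bool) -> bool:
    _, _, all_zero_spec_tokens = batch_flags(groups)
    return (num_lookahead_slots == 0 or disable_all_speculation
            or all_zero_spec_tokens)


# An all-prefill batch is scheduled with num_lookahead_slots == 0, so it takes
# the no_spec path; a decode batch with lookahead slots and nonzero
# speculative tokens keeps speculating.
assert decide_no_spec([SeqGroupMeta(True, 0)], 0, False)
assert not decide_no_spec([SeqGroupMeta(False, 3)], 3, False)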
@@ -442,6 +452,15 @@ def execute_model( num_lookahead_slots=num_lookahead_slots, no_spec=no_spec, disable_all_speculation=disable_all_speculation, + # When both chunked prefill and speculative decoding are enabled + # it is possible that the same batch contains both prefill + # and decodes. If that happens in the scorer we run the batch + # as one single forward pass. However, in the proposer we + # run them as 2 different batches - one for prefill and + # the other for decodes. The variable indicates to the non-driver + # worker that there are prefills as part of the speculative batch + # and hence it needs to run an extra prefill forward pass. + run_spec_proposer_for_prefill=atleast_one_prompt, ) broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) @@ -653,6 +672,8 @@ def _run_non_driver_rank(self) -> bool: if not data["no_spec"]: self.scorer_worker.execute_model() + if data["run_spec_proposer_for_prefill"]: + self.proposer_worker.execute_model() return True From cec04431295a9e26b3917298ef61c9509f9e9801 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 11:35:10 -0500 Subject: [PATCH 055/293] refactor Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_model_runner.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dd0d1824246d4..1492a3ba89f0a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -329,18 +329,15 @@ def _prepare_inputs( # request in the batch. While we should not sample any token from this # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. - # TODO: Support prompt logprobs. maybe_sample_logits_indices = query_start_loc[1:] - 1 num_query_tokens = torch.diff(query_start_loc) - # One or more requests require prompt logprobs - complete_req_mask = torch.tensor( - [not x for x in scheduler_output.partial_running_reqs]) - if do_prompt_logprobs: prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) - prompt_logits_mask[ - maybe_sample_logits_indices[complete_req_mask]] = False + # Sequence offsets where a token is being decoded are *not* prompt + # tokens, unless the request in question is partial + prompt_logits_mask[maybe_sample_logits_indices[ + ~torch.tensor(scheduler_output.partial_running_reqs)]] = False return (input_ids, attn_metadata, num_query_tokens, maybe_sample_logits_indices, prompt_logits_mask) @@ -448,6 +445,9 @@ def execute_model( sampling_metadata = self._prepare_sampling(scheduler_output) + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -459,9 +459,6 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - # Prepare the decoder inputs. 
( input_ids, From 73157819c72af1e12c49714bfe387e29fba4f4d6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 12:53:18 -0500 Subject: [PATCH 056/293] attempted sample_metadata fix; sample logprobs work, prompt logprobs broken Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 31 +++++----- vllm/v1/sample/metadata.py | 10 ++- vllm/v1/sample/sampler.py | 15 ++++- vllm/v1/worker/gpu_model_runner.py | 99 ++++++++++++++++-------------- 4 files changed, 90 insertions(+), 65 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 0e09da028b16f..87113ea2f65e8 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -109,6 +109,7 @@ def schedule(self) -> "SchedulerOutput": # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. has_partial_request = False + partial_req_index = -1 req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". @@ -158,9 +159,11 @@ def schedule(self) -> "SchedulerOutput": ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens + if (request.num_computed_tokens + num_new_tokens < + request.num_tokens): + has_partial_request = True + partial_req_index = req_index req_index += 1 - has_partial_request = (request.num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -236,8 +239,10 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + if (request.num_computed_tokens + num_new_tokens < + request.num_tokens): + has_partial_request = True + partial_req_index = req_index # Encoder-related. if encoder_inputs_to_schedule: @@ -248,13 +253,6 @@ def schedule(self) -> "SchedulerOutput": self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget - # Now that requests are scheduled, generate a mask indicating which - # request is partial - partial_running_reqs = [ - (req.num_computed_tokens + num_scheduled_tokens[req.request_id] < - req.num_tokens) for req in self.running - ] - # Check if the scheduling constraints are satisfied. 
total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -285,7 +283,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, - partial_running_reqs=partial_running_reqs, + partial_req_index=partial_req_index, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -470,9 +468,14 @@ def update_from_output( if do_prompt_logprobs: max_prompt_logprobs = request.max_prompt_logprobs + # Number of new prompt tokens is the number of scheduled + # tokens *if* the request is partial (because the sampled + # token is discarded and all sequence offsets are prompt + # offsets), otherwise it is the number of scheduled + # tokens minus one (for the sampled token) num_new_prompt_tokens = ( num_scheduled_tokens[request.request_id] - - int(not scheduler_output.partial_running_reqs[req_index])) + int(scheduler_output.partial_req_index != req_index)) request_do_prompt_logprobs = (max_prompt_logprobs is not None and max_prompt_logprobs > 0 @@ -774,7 +777,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] - partial_running_reqs: List[bool] # True if running req is partial + partial_req_index: int # >0 if running req is partial, -1 o/w num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 51fdae841971b..c1d817c8f3ffd 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -21,6 +21,10 @@ class SamplingMetadata: max_num_logprobs: int max_num_prompt_logprobs: int - num_query_tokens: Optional[torch.Tensor] = None - maybe_sample_logits_indices: Optional[torch.Tensor] = None - prompt_logits_mask: Optional[torch.Tensor] = None + query_start_loc: Optional[torch.Tensor] + num_query_tokens: Optional[torch.Tensor] + #maybe_sample_logits_indices: Optional[torch.Tensor] = None + #prompt_logits_mask: Optional[torch.Tensor] = None + + num_input_tokens: int + partial_req_index: int # >0 if there is a partial request, -1 o/w diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 4a0a3afb35e0b..4448b55deb868 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -228,9 +228,18 @@ def forward( do_any_logprobs = do_logprobs or do_prompt_logprobs num_query_tokens = sampling_metadata.num_query_tokens - maybe_sample_logits_indices = ( - sampling_metadata.maybe_sample_logits_indices) - prompt_logits_mask = sampling_metadata.prompt_logits_mask + # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial + # request in the batch. While we should not sample any token from this + # partial request, we do so for simplicity. We will ignore the sampled + # token from the partial request. + maybe_sample_logits_indices = sampling_metadata.query_start_loc[1:] - 1 + prompt_logits_mask = torch.ones(sampling_metadata.num_input_tokens, + dtype=torch.bool) + # Sequence offsets where a token is being decoded are *not* prompt + # tokens... + prompt_logits_mask[maybe_sample_logits_indices] = False + # ...unless the request in question is partial. 
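# A minimal sketch of the intent described in the comments above: every
# request's last scheduled offset is a "sample" offset rather than a prompt
# offset, except for a partial request, whose discarded sample token still
# counts toward its prompt logprobs. Made-up numbers, plain torch only.
import torch

num_input_tokens = 12
last_token_indices = torch.tensor([3, 7, 11])  # one sample offset per request
partial_req_index = 2  # third request is still a prefill chunk

prompt_mask = torch.ones(num_input_tokens, dtype=torch.bool)
prompt_mask[last_token_indices] = False
prompt_mask[last_token_indices[partial_req_index]] = True
assert prompt_mask.tolist() == [
    True, True, True, False, True, True, True, False,
    True, True, True, True
]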
+ prompt_logits_mask[sampling_metadata.partial_req_index] = True # Apply temperature, top-k and top-p to logits at sequence offsets # where a new token is being decoded. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1492a3ba89f0a..2e642c5869c97 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -211,10 +211,8 @@ def _prepare_inputs( self, scheduler_output: "SchedulerOutput", sampling_metadata: SamplingMetadata, - num_input_tokens: int, ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -291,14 +289,7 @@ def _prepare_inputs( out=slot_mapping) # Prepare the attention metadata. - query_start_loc = torch.empty((num_reqs + 1, ), - dtype=torch.int32, - device="cpu", - pin_memory=self.pin_memory) - query_start_loc_np = query_start_loc.numpy() - query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) - + query_start_loc = sampling_metadata.query_start_loc seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens) max_seq_len = seq_lens.max() @@ -313,7 +304,6 @@ def _prepare_inputs( input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() attn_metadata = FlashAttentionMetadata( @@ -329,26 +319,12 @@ def _prepare_inputs( # request in the batch. While we should not sample any token from this # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. - maybe_sample_logits_indices = query_start_loc[1:] - 1 - num_query_tokens = torch.diff(query_start_loc) - - if do_prompt_logprobs: - prompt_logits_mask = torch.ones(num_input_tokens, dtype=torch.bool) - # Sequence offsets where a token is being decoded are *not* prompt - # tokens, unless the request in question is partial - prompt_logits_mask[maybe_sample_logits_indices[ - ~torch.tensor(scheduler_output.partial_running_reqs)]] = False - - return (input_ids, attn_metadata, num_query_tokens, - maybe_sample_logits_indices, prompt_logits_mask) - else: - # No requests require prompt logprobs - return (input_ids, attn_metadata, num_query_tokens, - maybe_sample_logits_indices, None) + return (input_ids, attn_metadata) def _prepare_sampling( self, scheduler_output: "SchedulerOutput", + num_input_tokens: int, ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids @@ -358,7 +334,11 @@ def _prepare_sampling( or scheduler_output.scheduled_resumed_reqs): skip_copy = False # Create the sampling metadata. 
- sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) + sampling_metadata = self.input_batch.make_sampling_metadata( + scheduler_output, + num_input_tokens, + skip_copy, + ) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -443,11 +423,6 @@ def execute_model( self._execute_encoder(scheduler_output) encoder_outputs = self._gather_encoder_outputs(scheduler_output) - sampling_metadata = self._prepare_sampling(scheduler_output) - - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -459,16 +434,17 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + sampling_metadata = self._prepare_sampling(scheduler_output, + num_input_tokens) + do_logprobs = sampling_metadata.max_num_logprobs > 0 + do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + # Prepare the decoder inputs. ( input_ids, attn_metadata, - num_query_tokens, - maybe_sample_logits_indices, - prompt_logits_mask, ) = self._prepare_inputs(scheduler_output=scheduler_output, - sampling_metadata=sampling_metadata, - num_input_tokens=num_input_tokens) + sampling_metadata=sampling_metadata) # Get the inputs embeds. if encoder_outputs: @@ -494,11 +470,6 @@ def execute_model( hidden_states = hidden_states[:num_scheduled_tokens] - sampling_metadata.num_query_tokens = num_query_tokens - sampling_metadata.maybe_sample_logits_indices = ( - maybe_sample_logits_indices) - sampling_metadata.prompt_logits_mask = prompt_logits_mask - # Sample the next token and get logprobs if needed. sampler_output = self.model.sample( logits=self.model.compute_logits(hidden_states, None), @@ -855,6 +826,8 @@ def condense(self, empty_req_indices: List[int]) -> None: def make_sampling_metadata( self, + scheduler_output: "SchedulerOutput", + num_input_tokens: int, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -864,8 +837,36 @@ def make_sampling_metadata( self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) self.top_k[:self.num_reqs].copy_( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) + + num_reqs = self.num_reqs + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = [] + max_num_scheduled_tokens = 0 + for req_id in self.req_ids[:num_reqs]: + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens.append(num_tokens) + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32) + assert max_num_scheduled_tokens > 0 + + # Compute query start offsets. It makes sense to compute this here + # rather than in model runner _prepare_inputs() because query start + # offsets are required for computing num_query_tokens in the scenario + # where prompt logprobs are required by the batch. 
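# A minimal sketch, with made-up per-request token counts, of how the query
# start offsets computed below relate to num_query_tokens and to the
# last-token ("maybe sample") indices used by the sampler; numpy/torch only,
# and the variable names are illustrative.
import numpy as np
import torch

num_scheduled_tokens = np.array([4, 1, 3], dtype=np.int32)  # 3 requests
query_start_loc = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
np.cumsum(num_scheduled_tokens, out=query_start_loc[1:])
assert query_start_loc.tolist() == [0, 4, 5, 8]

qsl = torch.from_numpy(query_start_loc)
assert torch.diff(qsl).tolist() == [4, 1, 3]  # tokens per request
assert (qsl[1:] - 1).tolist() == [3, 4, 7]  # offset of each request's last token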
+ query_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) + query_start_loc = query_start_loc.to(self.device, non_blocking=True) + return SamplingMetadata( - temperature=self.temperature[:self.num_reqs], + temperature=self.temperature[:num_reqs], all_greedy=self.all_greedy, all_random=self.all_random, top_p=self.top_p[:self.num_reqs], @@ -874,7 +875,15 @@ def make_sampling_metadata( no_top_k=self.no_top_k, generators=self.generators, max_num_logprobs=self.max_num_logprobs, - max_num_prompt_logprobs=self.max_num_prompt_logprobs) + max_num_prompt_logprobs=self.max_num_prompt_logprobs, + query_start_loc=query_start_loc, + num_input_tokens=num_input_tokens, + partial_req_index=scheduler_output.partial_req_index, + # Required for prompt logprobs temperature computation. + # If prompt logprobs is not required for this batch, then + # avoid storing num_query_tokens + num_query_tokens=(torch.diff(query_start_loc) + if self.max_num_prompt_logprobs > 0 else None)) @property def num_reqs(self) -> int: From 2cee23142d4832759d8557b66f40e28c325a0d3d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 14:03:20 -0500 Subject: [PATCH 057/293] cleaned up sampling metadata Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 24 +++++++++++++++--------- vllm/v1/sample/metadata.py | 6 +++--- vllm/v1/sample/sampler.py | 5 +++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 87113ea2f65e8..5ada9ceab54e6 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -109,7 +109,6 @@ def schedule(self) -> "SchedulerOutput": # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. has_partial_request = False - partial_req_index = -1 req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". @@ -159,10 +158,8 @@ def schedule(self) -> "SchedulerOutput": ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens - if (request.num_computed_tokens + num_new_tokens < - request.num_tokens): - has_partial_request = True - partial_req_index = req_index + has_partial_request = (request.num_computed_tokens + num_new_tokens + < request.num_tokens) req_index += 1 # Encoder-related. @@ -239,10 +236,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - if (request.num_computed_tokens + num_new_tokens < - request.num_tokens): - has_partial_request = True - partial_req_index = req_index + has_partial_request = (request.num_computed_tokens + + num_new_tokens < request.num_tokens) # Encoder-related. 
if encoder_inputs_to_schedule: @@ -279,6 +274,17 @@ def schedule(self) -> "SchedulerOutput": req.num_computed_tokens) for req in scheduled_running_reqs ] preempted_req_ids = {req.request_id for req in preempted_reqs} + + partial_req_indices = [ + idx for idx, request in enumerate(self.running) + if request.num_computed_tokens + + num_scheduled_tokens[request.request_id] < request.num_tokens + ] + num_partial_reqs = len(partial_req_indices) + assert num_partial_reqs < 2 + partial_req_index = (partial_req_indices[0] + if num_partial_reqs > 0 else -1) + scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index c1d817c8f3ffd..b9c97bcfb0d47 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -18,13 +18,13 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] + # Max number of sample or prompt logprobs + # (respectiely) at the batch level max_num_logprobs: int max_num_prompt_logprobs: int + # Attributes which support logprob computation query_start_loc: Optional[torch.Tensor] num_query_tokens: Optional[torch.Tensor] - #maybe_sample_logits_indices: Optional[torch.Tensor] = None - #prompt_logits_mask: Optional[torch.Tensor] = None - num_input_tokens: int partial_req_index: int # >0 if there is a partial request, -1 o/w diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 4448b55deb868..e0b03f7aa03b3 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -237,9 +237,10 @@ def forward( dtype=torch.bool) # Sequence offsets where a token is being decoded are *not* prompt # tokens... + pdx = sampling_metadata.partial_req_index prompt_logits_mask[maybe_sample_logits_indices] = False - # ...unless the request in question is partial. - prompt_logits_mask[sampling_metadata.partial_req_index] = True + # ...unless the request in question is partial + prompt_logits_mask[maybe_sample_logits_indices[pdx]] = True # Apply temperature, top-k and top-p to logits at sequence offsets # where a new token is being decoded. From cc1e43a2cffcf5be80613c4907fdf4a42b68fe95 Mon Sep 17 00:00:00 2001 From: Conroy Cheers Date: Wed, 27 Nov 2024 05:26:28 +1100 Subject: [PATCH 058/293] [Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735) Signed-off-by: Conroy Cheers Signed-off-by: Andrew Feldman --- CMakeLists.txt | 10 +- vllm/platforms/__init__.py | 10 +- vllm/platforms/cuda.py | 222 +++++++++++++++++++++++-------------- 3 files changed, 155 insertions(+), 87 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff34225537cdd..882d4412632a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") @@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. 
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -300,8 +300,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -427,7 +427,7 @@ set_gencode_flags_for_srcs( CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 1f68fc2e25df3..7cb8ac4b0a1e0 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -28,7 +28,15 @@ finally: pynvml.nvmlShutdown() except Exception: - pass + # CUDA is supported on Jetson, but NVML may not be. + import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): + is_cuda = True is_rocm = False diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 70724b8be4c45..0d07050fd1b6a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,7 +4,7 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, List, TypeVar import pynvml import torch @@ -38,10 +38,23 @@ # see https://github.com/huggingface/diffusers/issues/9704 for details torch.backends.cuda.enable_cudnn_sdp(False) -# NVML utils -# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, -# all the related functions work on real physical device ids. -# the major benefit of using NVML is that it will not initialize CUDA + +def device_id_to_physical_device_id(device_id: int) -> int: + if "CUDA_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + if device_ids == [""]: + msg = ( + "CUDA_VISIBLE_DEVICES is set to empty string, which means" + " GPU support is disabled. If you are using ray, please unset" + " the environment variable `CUDA_VISIBLE_DEVICES` inside the" + " worker/actor. 
" + "Check https://github.com/vllm-project/vllm/issues/8402 for" + " more information.") + raise RuntimeError(msg) + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]: @@ -57,87 +70,75 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: return wrapper -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return pynvml.nvmlDeviceGetCudaComputeCapability(handle) - - -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_name(device_id: int = 0) -> str: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return pynvml.nvmlDeviceGetName(handle) - - -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_total_memory(device_id: int = 0) -> int: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) - +class CudaPlatformBase(Platform): + _enum = PlatformEnum.CUDA + device_type: str = "cuda" + dispatch_key: str = "CUDA" -@with_nvml_context -def warn_if_different_devices(): - device_ids: int = pynvml.nvmlDeviceGetCount() - if device_ids > 1: - device_names = [get_physical_device_name(i) for i in range(device_ids)] - if len(set(device_names)) > 1 and os.environ.get( - "CUDA_DEVICE_ORDER") != "PCI_BUS_ID": - logger.warning( - "Detected different devices in the system: \n%s\nPlease" - " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " - "avoid unexpected behavior.", "\n".join(device_names)) + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + raise NotImplementedError + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + raise NotImplementedError -try: - from sphinx.ext.autodoc.mock import _MockModule + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + raise NotImplementedError - if not isinstance(pynvml, _MockModule): - warn_if_different_devices() -except ModuleNotFoundError: - warn_if_different_devices() + @classmethod + def is_full_nvlink(cls, device_ids: List[int]) -> bool: + raise NotImplementedError + @classmethod + def log_warnings(cls): + pass -def device_id_to_physical_device_id(device_id: int) -> int: - if "CUDA_VISIBLE_DEVICES" in os.environ: - device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") - if device_ids == [""]: - msg = ( - "CUDA_VISIBLE_DEVICES is set to empty string, which means" - " GPU support is disabled. If you are using ray, please unset" - " the environment variable `CUDA_VISIBLE_DEVICES` inside the" - " worker/actor. 
" - "Check https://github.com/vllm-project/vllm/issues/8402 for" - " more information.") - raise RuntimeError(msg) - physical_device_id = device_ids[device_id] - return int(physical_device_id) - else: - return device_id + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" -class CudaPlatform(Platform): - _enum = PlatformEnum.CUDA - device_type: str = "cuda" - dispatch_key: str = "CUDA" +# NVML utils +# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, +# all the related functions work on real physical device ids. +# the major benefit of using NVML is that it will not initialize CUDA +class NvmlCudaPlatform(CudaPlatformBase): @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: physical_device_id = device_id_to_physical_device_id(device_id) - major, minor = get_physical_device_capability(physical_device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) return DeviceCapability(major=major, minor=minor) @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_name(cls, device_id: int = 0) -> str: physical_device_id = device_id_to_physical_device_id(device_id) - return get_physical_device_name(physical_device_id) + return cls._get_physical_device_name(physical_device_id) @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_total_memory(cls, device_id: int = 0) -> int: physical_device_id = device_id_to_physical_device_id(device_id) - return get_physical_device_total_memory(physical_device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) @classmethod @with_nvml_context @@ -153,27 +154,86 @@ def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: if i < j: try: p2p_status = pynvml.nvmlDeviceGetP2PStatus( - handle, peer_handle, - pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + handle, + peer_handle, + pynvml.NVML_P2P_CAPS_INDEX_NVLINK, + ) if p2p_status != pynvml.NVML_P2P_STATUS_OK: return False except pynvml.NVMLError: logger.exception( - "NVLink detection failed. This is normal if your" - " machine has no NVLink equipped.") + "NVLink detection failed. 
This is normal if" + " your machine has no NVLink equipped.") return False return True @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config - if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + def _get_physical_device_name(cls, device_id: int = 0) -> str: + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + return pynvml.nvmlDeviceGetName(handle) + + @classmethod + @with_nvml_context + def log_warnings(cls): + device_ids: int = pynvml.nvmlDeviceGetCount() + if device_ids > 1: + device_names = [ + cls._get_physical_device_name(i) for i in range(device_ids) + ] + if (len(set(device_names)) > 1 + and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"): + logger.warning( + "Detected different devices in the system: \n%s\nPlease" + " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " + "avoid unexpected behavior.", + "\n".join(device_names), + ) + + +class NonNvmlCudaPlatform(CudaPlatformBase): + + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + major, minor = torch.cuda.get_device_capability(device_id) + return DeviceCapability(major=major, minor=minor) + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return torch.cuda.get_device_name(device_id) + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory + + @classmethod + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + logger.exception( + "NVLink detection not possible, as context support was" + " not found. Assuming no NVLink available.") + return False + + +# Autodetect either NVML-enabled or non-NVML platform +# based on whether NVML is available. +nvml_available = False +try: + try: + pynvml.nvmlInit() + nvml_available = True + except Exception: + # On Jetson, NVML is not supported. 
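For reference, the three queries that `NonNvmlCudaPlatform` falls back on can be exercised directly through `torch.cuda`; the sketch below assumes a CUDA-enabled PyTorch build with at least one visible device. Unlike the NVML path, these calls go through the CUDA runtime, so they respect `CUDA_VISIBLE_DEVICES` but may initialize CUDA as a side effect, which is exactly the trade-off the NVML comment above describes.

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    name = torch.cuda.get_device_name(0)
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    # e.g. "Orin: sm_87, 30.6 GiB" on a Jetson-class device without NVML
    print(f"{name}: sm_{major}{minor}, {total_bytes / 2**30:.1f} GiB")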
+ nvml_available = False +finally: + if nvml_available: + pynvml.nvmlShutdown() + +CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform + +try: + from sphinx.ext.autodoc.mock import _MockModule + + if not isinstance(pynvml, _MockModule): + CudaPlatform.log_warnings() +except ModuleNotFoundError: + CudaPlatform.log_warnings() From 07f9e89bc0c03ec00d6019b3da32ece42c98df3e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 26 Nov 2024 13:44:01 -0500 Subject: [PATCH 059/293] [Bugfix] Fix using `-O[0,3]` with LLM entrypoint (#10677) Signed-off-by: mgoin Signed-off-by: Andrew Feldman --- vllm/engine/arg_utils.py | 5 ++++- vllm/entrypoints/llm.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 60ad5ee54a2f2..90b4798f17a13 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -206,7 +206,10 @@ def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object - if isinstance(self.compilation_config, (int, dict)): + if isinstance(self.compilation_config, (int)): + self.compilation_config = CompilationConfig.from_cli( + str(self.compilation_config)) + elif isinstance(self.compilation_config, (dict)): self.compilation_config = CompilationConfig.from_cli( json.dumps(self.compilation_config)) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e07f4c04abd84..1551a9a998160 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -185,8 +185,14 @@ def __init__( kwargs["disable_log_stats"] = True if compilation_config is not None: - compilation_config_instance = CompilationConfig.from_cli( - json.dumps(compilation_config)) + if isinstance(compilation_config, (int)): + compilation_config_instance = CompilationConfig.from_cli( + str(compilation_config)) + elif isinstance(compilation_config, (dict)): + compilation_config_instance = CompilationConfig.from_cli( + json.dumps(compilation_config)) + else: + compilation_config_instance = compilation_config else: compilation_config_instance = None From 27e4923d07f359c9f4f7a6b703c955d0bb9c15c7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 14:12:50 -0500 Subject: [PATCH 060/293] small change Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- vllm/v1/engine/llm_engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b93634230529e..402a1c5dc85ad 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -160,11 +160,8 @@ def step(self) -> List[RequestOutput]: return request_outputs def get_model_config(self): - """Gets the model configuration.""" return self.model_config - # TODO(rob): Can we get rid of these? 
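The `-O[0,3]` fix above applies the same int/dict normalization in both `EngineArgs` and the `LLM` entrypoint; the helper below is only a sketch of that dispatch, with `from_cli` standing in for `CompilationConfig.from_cli` (which, per the diff, accepts a string).

import json

def normalize_compilation_config(value, from_cli):
    # Integers (e.g. 3 from -O3) are stringified, dicts are serialized to
    # JSON, and anything else is assumed to already be a config object.
    if isinstance(value, int):
        return from_cli(str(value))
    if isinstance(value, dict):
        return from_cli(json.dumps(value))
    return value

# Both user-facing forms normalize the same way with a stand-in parser:
print(normalize_compilation_config(3, from_cli=lambda s: f"parsed({s})"))
print(normalize_compilation_config({"level": 3}, from_cli=lambda s: f"parsed({s})"))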
- def start_profile(self): self.engine_core.profile(True) From 1ccef6c9edd4937b831aa08626bd3464b3fe2f40 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 15:00:26 -0500 Subject: [PATCH 061/293] partially re-enabled detokenize cases in test Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index a303438c8a3d9..01be27926ef84 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -57,7 +57,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (None, 0), (0, None), (0, 0), - (None, 7), + (None, 6), (0, 5), ] elif batch_logprobs_composition == "SAMPLE_PROMPT": @@ -67,7 +67,7 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: (0, 0), (5, None), (3, 0), - (7, 3), + (6, 3), (None, 6), (0, 5), ] @@ -243,7 +243,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) +@pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) @pytest.mark.parametrize("batch_logprobs_composition", ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) @@ -252,7 +252,7 @@ def test_get_logprobs_and_prompt_logprobs( vllm_runner, model: str, dtype: str, - # detokenize: bool, + detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, @@ -279,6 +279,7 @@ def test_get_logprobs_and_prompt_logprobs( dtype detokenize: if False, return generated tokens bypassing detokenizer batch_logprobs_composition: logprobs configuration for test batch + max_num_batched_tokens: token budget for scheduling example_prompts monkeypatch """ @@ -301,8 +302,7 @@ def test_get_logprobs_and_prompt_logprobs( ["half"]) # needed for comparing logprobs with HF # @pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", - ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) def test_fast_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, From a29345137b12b292e075acbdbf352b895c2f3a09 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 26 Nov 2024 15:13:57 -0500 Subject: [PATCH 062/293] deferring support for detokenization feature to subsequent SamplingParams work Signed-off-by: Andrew Feldman Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 01be27926ef84..7c736d957e38a 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -243,7 +243,6 @@ def _test_case_get_logprobs_and_prompt_logprobs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128, 256, 1024]) @pytest.mark.parametrize("batch_logprobs_composition", ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) @@ -252,7 +251,6 @@ def 
test_get_logprobs_and_prompt_logprobs( vllm_runner, model: str, dtype: str, - detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, @@ -277,20 +275,17 @@ def test_get_logprobs_and_prompt_logprobs( vllm_runner model dtype - detokenize: if False, return generated tokens bypassing detokenizer batch_logprobs_composition: logprobs configuration for test batch max_num_batched_tokens: token budget for scheduling example_prompts monkeypatch """ - detokenize = True - _test_case_get_logprobs_and_prompt_logprobs( hf_runner=hf_runner, vllm_runner=vllm_runner, model=model, dtype=dtype, - detokenize=detokenize, + detokenize=True, batch_logprobs_composition=batch_logprobs_composition, max_num_batched_tokens=max_num_batched_tokens, example_prompts=example_prompts, @@ -300,15 +295,14 @@ def test_get_logprobs_and_prompt_logprobs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF -# @pytest.mark.parametrize("detokenize", [True, False]) @pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", ["SAMPLE_PROMPT"]) +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) def test_fast_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, model: str, dtype: str, - # detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, example_prompts, @@ -319,14 +313,13 @@ def test_fast_get_logprobs_and_prompt_logprobs( Faster version of `test_get_logprobs_and_prompt_logprobs` with fewer test cases. """ - detokenize = True _test_case_get_logprobs_and_prompt_logprobs( hf_runner=hf_runner, vllm_runner=vllm_runner, model=model, dtype=dtype, - detokenize=detokenize, + detokenize=True, batch_logprobs_composition=batch_logprobs_composition, max_num_batched_tokens=max_num_batched_tokens, example_prompts=example_prompts, @@ -356,15 +349,12 @@ def test_max_logprobs(monkeypatch): @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, - monkeypatch): +def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` Args: vllm_runner model - detokenize: whether to feed generated tokens to detokenizer example_prompts monkeypatch """ @@ -385,8 +375,7 @@ def test_none_logprobs(vllm_runner, model, detokenize: bool, example_prompts, sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, logprobs=None, prompt_logprobs=None, - temperature=0.0, - detokenize=detokenize) + temperature=0.0) results_logprobs_none = vllm_model.model.generate( example_prompts, sampling_params=sampling_params_logprobs_none) From 86d02594465080b86c5fa03d9df4a0005e817845 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 26 Nov 2024 15:29:00 -0500 Subject: [PATCH 063/293] [Bugfix] Check bnb_4bit_quant_storage for bitsandbytes (#10642) Signed-off-by: Andrew Feldman --- .../layers/quantization/bitsandbytes.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 39965ac9115c2..6a0de3034142a 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -20,6 +20,7 @@ def __init__( load_in_8bit: bool = False, 
load_in_4bit: bool = True, bnb_4bit_compute_dtype: str = "float32", + bnb_4bit_quant_storage: str = "uint8", bnb_4bit_quant_type: str = "fp4", bnb_4bit_use_double_quant: bool = False, llm_int8_enable_fp32_cpu_offload: bool = False, @@ -31,6 +32,7 @@ def __init__( self.load_in_8bit = load_in_8bit self.load_in_4bit = load_in_4bit self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype + self.bnb_4bit_quant_storage = bnb_4bit_quant_storage self.bnb_4bit_quant_type = bnb_4bit_quant_type self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload @@ -38,10 +40,15 @@ def __init__( self.llm_int8_skip_modules = llm_int8_skip_modules or [] self.llm_int8_threshold = llm_int8_threshold + if self.bnb_4bit_quant_storage not in ["uint8"]: + raise ValueError("Unsupported bnb_4bit_quant_storage: " + f"{self.bnb_4bit_quant_storage}") + def __repr__(self) -> str: return (f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, " f"load_in_4bit={self.load_in_4bit}, " f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, " + f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, " f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, " f"llm_int8_skip_modules={self.llm_int8_skip_modules})") @@ -80,6 +87,9 @@ def get_safe_value(config, keys, default_value=None): bnb_4bit_compute_dtype = get_safe_value(config, ["bnb_4bit_compute_dtype"], default_value="float32") + bnb_4bit_quant_storage = get_safe_value(config, + ["bnb_4bit_quant_storage"], + default_value="uint8") bnb_4bit_quant_type = get_safe_value(config, ["bnb_4bit_quant_type"], default_value="fp4") bnb_4bit_use_double_quant = get_safe_value( @@ -99,6 +109,7 @@ def get_safe_value(config, keys, default_value=None): load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit, bnb_4bit_compute_dtype=bnb_4bit_compute_dtype, + bnb_4bit_quant_storage=bnb_4bit_quant_storage, bnb_4bit_quant_type=bnb_4bit_quant_type, bnb_4bit_use_double_quant=bnb_4bit_use_double_quant, llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload, From 1f6d7d2f79658be7f3d2ab86d284a51efa105fcf Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:46:11 -0800 Subject: [PATCH 064/293] [V1] Refactor model executable interface for multimodal models (#10570) Signed-off-by: Roger Wang Signed-off-by: Andrew Feldman --- vllm/model_executor/models/blip2.py | 61 ++++++----- vllm/model_executor/models/chameleon.py | 58 +++++++--- vllm/model_executor/models/chatglm.py | 54 ++++++---- vllm/model_executor/models/fuyu.py | 43 +++++--- vllm/model_executor/models/interfaces.py | 36 ++++++- vllm/model_executor/models/internvl.py | 54 +++++++--- vllm/model_executor/models/llava.py | 15 +-- vllm/model_executor/models/llava_next.py | 51 +++++---- .../model_executor/models/llava_next_video.py | 44 +++++--- vllm/model_executor/models/llava_onevision.py | 74 +++++++++---- vllm/model_executor/models/molmo.py | 88 +++++++-------- vllm/model_executor/models/paligemma.py | 52 +++++---- vllm/model_executor/models/phi3v.py | 16 +-- vllm/model_executor/models/qwen2_audio.py | 59 ++++++---- vllm/model_executor/models/qwen2_vl.py | 102 ++++++++++++------ vllm/model_executor/models/ultravox.py | 72 ++++++++----- vllm/model_executor/models/utils.py | 5 +- vllm/v1/worker/gpu_model_runner.py | 3 +- 18 files changed, 581 insertions(+), 306 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 7d7639b4a92ce..d2592016aff34 100644 --- 
a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -609,6 +610,25 @@ def _process_image_input(self, return self.language_projection(query_output) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + BLIP2_IMAGE_TOKEN_ID) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -616,6 +636,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: """Run forward pass for BLIP-2. @@ -648,32 +669,24 @@ def forward( See also: :class:`Blip2ImageInputs` """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - BLIP2_IMAGE_TOKEN_ID) - - input_ids = None - else: - inputs_embeds = None - - hidden_states = self.language_model.model( - input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 5a6d6432112f0..a40c321ce0a58 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -29,6 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) @@ -38,7 +39,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. @@ -987,6 +988,29 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), ) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + assert self.model.vqmodel is not None + image_tokens = self.model.get_image_tokens(image_input["data"].to( + self.config.torch_dtype)) + vision_embeddings = self.model.get_input_embeddings(image_tokens) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.model.vocabulary_mapping.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -994,27 +1018,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - assert self.model.vqmodel is not None - image_tokens = self.model.get_image_tokens( - image_input["data"].to(self.config.torch_dtype)) - image_token_id = self.model.vocabulary_mapping.image_token_id - special_image_mask = input_ids == image_token_id - image_tokens = image_tokens.to(input_ids.device, - input_ids.dtype) - input_ids = input_ids.masked_scatter(special_image_mask, - image_tokens) - - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5bcbce7180ca4..6c50882d83c3b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -33,7 +33,8 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs +from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs, + NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -545,6 +546,30 @@ def _parse_and_validate_image_input( """) return GLMImagePixelInputs(pixel_values=pixel_values) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input["pixel_values"] is None: + return None + pixel_values = image_input["pixel_values"].to( + dtype=self.config.torch_dtype) + vision_embeddings = self.vision(pixel_values) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embedding(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_glm_vision_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + vision_embeddings=multimodal_embeddings, + boi_token_id=self.config.boi_token_id, + eoi_token_id=self.config.eoi_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -552,26 +577,17 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> torch.Tensor: - if intermediate_tensors is None: - inputs_embeds = self.embedding(input_ids) - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input["pixel_values"] is not None: - pixel_values = image_input["pixel_values"].to( - dtype=inputs_embeds.dtype) - image_embeds = self.vision(pixel_values) - - boi_token_id = self.config.boi_token_id - eoi_token_id = self.config.eoi_token_id - - inputs_embeds = merge_glm_vision_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - vision_embeddings=image_embeds, - boi_token_id=boi_token_id, - eoi_token_id=eoi_token_id) + + # NOTE: In v1, inputs_embeds 
is always generated at model runner, this + # condition is for v0 compatibility. + if intermediate_tensors is None and inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None else: inputs_embeds = intermediate_tensors["hidden_states"] diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7b46907ac83ab..6e86900326c4b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,6 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -302,6 +303,25 @@ def _process_image_input( vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) return vision_embeddings + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + _IMAGE_TOKEN_ID) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -309,24 +329,19 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ): if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model( input_ids=input_ids, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 9b4a97abf9b51..1545ce332309f 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -2,7 +2,7 @@ Protocol, Type, Union, overload, runtime_checkable) import torch -from typing_extensions import TypeIs +from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger from vllm.utils import supports_kw @@ -10,10 +10,14 @@ from .interfaces_base import is_embedding_model if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.multimodal.inputs import NestedTensors # noqa: F401 from vllm.sequence import IntermediateTensors logger = init_logger(__name__) +T = TypeVar("T", default="NestedTensors") + @runtime_checkable class SupportsMultiModal(Protocol): @@ -28,6 +32,36 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + def get_multimodal_embeddings(self, **kwargs) -> Optional[T]: + """ + Returns multimodal embeddings generated from multimodal kwargs + to be merged with text embeddings. + """ + ... + + # Only for models that support v0 chunked prefill + # TODO(ywang96): Remove this overload once v0 is deprecated + @overload + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[T] = None, + attn_metadata: Optional["AttentionMetadata"] = None, + ) -> torch.Tensor: + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[T] = None, + ) -> torch.Tensor: + """ + Returns the input embeddings merged from the text embeddings from + input_ids and the multimodal embeddings generated from multimodal + kwargs. + """ + ... 
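To make the new protocol methods concrete, the toy module below follows the `get_multimodal_embeddings` / `get_input_embeddings` split described above. It is illustrative only: the vision tower is faked with random features and the placeholder token id is made up, but the final call sequence mirrors, roughly, what the v1 model runner does before invoking `forward` with `inputs_embeds`.

import torch
import torch.nn as nn

class ToyMultiModalLM(nn.Module):
    IMAGE_TOKEN_ID = 32000  # hypothetical placeholder token id

    def __init__(self, vocab_size: int = 32064, hidden_size: int = 64):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_multimodal_embeddings(self, pixel_values=None):
        if pixel_values is None:
            return None
        # Stand-in for a vision encoder + projector: one row per image.
        return torch.randn(pixel_values.shape[0],
                           self.embed_tokens.embedding_dim)

    def get_input_embeddings(self, input_ids, multimodal_embeddings=None):
        inputs_embeds = self.embed_tokens(input_ids)
        if multimodal_embeddings is not None:
            # Overwrite placeholder positions with the multimodal features.
            mask = input_ids == self.IMAGE_TOKEN_ID
            inputs_embeds[mask] = multimodal_embeddings.to(inputs_embeds.dtype)
        return inputs_embeds

# Call sequence: embeddings are produced and merged before the forward pass.
model = ToyMultiModalLM()
input_ids = torch.tensor([[1, ToyMultiModalLM.IMAGE_TOKEN_ID, 2]])
mm_embeds = model.get_multimodal_embeddings(
    pixel_values=torch.zeros(1, 3, 224, 224))
inputs_embeds = model.get_input_embeddings(input_ids, mm_embeds)  # (1, 3, 64)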
+ # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 47ac00b6afe9b..b1c0065afbf30 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,6 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -641,6 +642,26 @@ def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: visual_token_mask = None return visual_token_mask + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + assert self.img_context_token_id is not None + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.img_context_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -648,26 +669,22 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: + + visual_token_mask = None if intermediate_tensors is not None: input_ids = None inputs_embeds = None - visual_token_mask = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.img_context_token_id) - visual_token_mask = self._get_visual_token_mask(input_ids) - input_ids = None - else: - inputs_embeds = None - visual_token_mask = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None forward_kwargs = { "input_ids": input_ids, @@ -677,6 +694,13 @@ def forward( "intermediate_tensors": intermediate_tensors, "inputs_embeds": inputs_embeds, } + if self.img_context_token_id is not None: + visual_token_mask = self._get_visual_token_mask(input_ids) + + # We always overwrite it back to None after computing visual token + # mask so that this doesn't need to depend on encoder output + self.img_context_token_id = None + if self.is_mono: forward_kwargs.update({"visual_token_mask": visual_token_mask}) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 05c6cc62efcd7..e7757b3c7d405 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -478,7 +478,7 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) - def process_mm_inputs(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -488,12 +488,12 @@ def process_mm_inputs(self, **kwargs): def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.config.image_token_index) return inputs_embeds @@ -544,10 +544,11 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
elif inputs_embeds is None: - vision_embeddings = self.process_mm_inputs(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent + vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) input_ids = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index abeebb45fc4a7..e113f5862830d 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -19,6 +19,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of @@ -565,6 +566,30 @@ def _process_image_input( for i, patch_features_batch in enumerate(patch_embeddings) ] + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + if multimodal_embeddings is None: + return self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = embed_multimodal( + input_ids, + self.config.image_token_index, + self.language_model.model.get_input_embeddings, + multimodal_embeddings, + ) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -572,6 +597,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-NeXT. @@ -620,24 +646,14 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - inputs_embeds = embed_multimodal( - input_ids, - self.config.image_token_index, - self.language_model.model.get_input_embeddings, - lambda _: self._process_image_input(image_input), - ) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, @@ -645,7 +661,6 @@ def forward( attn_metadata, intermediate_tensors, inputs_embeds=inputs_embeds) - return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index e2880c76cf43d..b130791808924 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -18,6 +18,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -388,6 +389,25 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs): raise ValueError( f"Unsupported type of video input {type(video_pixels)}") + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + video_input = self._parse_and_validate_video_input(**kwargs) + if video_input is None: + return None + vision_embeddings = self._process_video_pixels(video_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.video_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -395,6 +415,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-NeXT-Video. @@ -404,22 +425,15 @@ def forward( pixel_values_videos: Pixels in each frames for each input videos. """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - video_input = self._parse_and_validate_video_input(**kwargs) - if video_input is not None: - video_embeddings = self._process_video_pixels(video_input) - inputs_embeds = self.language_model \ - .model.get_input_embeddings(input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, video_embeddings, - self.config.video_token_index) - - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 705ca1e4ab6e6..3166737d61582 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -824,6 +825,49 @@ def apply_pooling(self, image_features, stride=2): image_feature = image_feature.view(batch_frames, -1, dim) return image_feature + def get_multimodal_embeddings( + self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # We make a tuple of each embedding with its modality string. This is a + # temporary workaround for models to handle mixed modalities when + # get_multimodal_embeddings and get_input_embeddings are called + # separately. + # TODO(ywang96): Add support for mixed-modality inference for v1. + multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + + if "images" in modalities: + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings.append((vision_embeddings, "image")) + if "videos" in modalities: + video_input = modalities["videos"] + video_embeddings = self._process_video_pixels(video_input) + multimodal_embeddings.append((video_embeddings, "video")) + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[List[Tuple[NestedTensors, + str]]] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + for embeddings, modality in multimodal_embeddings: + if modality == "image": + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, embeddings, + self.config.image_token_index) + if modality == "video": + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, embeddings, + self.config.video_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -831,6 +875,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-Onevision. @@ -840,28 +885,15 @@ def forward( pixel_values_videos: Pixels in each frames for each input videos. 
""" if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - modalities = self._parse_and_validate_multimodal_inputs(**kwargs) - if modalities: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - if "images" in modalities: - image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - if "videos" in modalities: - video_input = modalities["videos"] - video_embeddings = self._process_video_pixels(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, video_embeddings, - self.config.video_token_index) - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ee7b560fe1ee4..acedddd84d7cb 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -3,7 +3,7 @@ from array import array from dataclasses import dataclass from functools import lru_cache, partial -from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union +from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict import torch from einops import rearrange @@ -36,6 +36,7 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -756,6 +757,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -1098,19 +1105,16 @@ def _process_image_input( return image_features - def _merge_multimodal_embeddings( - self, - inputs_embeds: torch.Tensor, - image_features: torch.Tensor, - image_input_idx: torch.Tensor, - seq_len: Union[torch.Tensor, List[torch.Tensor]], - ) -> torch.Tensor: + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + image_features = self._process_image_input(image_input) + image_input_idx = image_input["image_input_idx"] + seq_len = image_input["seq_len"] batch_size, num_image, num_patch = image_features.shape[:3] assert image_input_idx.shape == (batch_size, num_image, num_patch) - image_features = image_features.to(inputs_embeds.device) - seq_len = seq_len.to(inputs_embeds.device) - # insert the image feature into the embedding. 
image_features = image_features.view(batch_size, num_image * num_patch, -1) @@ -1130,12 +1134,24 @@ def _merge_multimodal_embeddings( image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx.flatten()[:, None] mat = image_input_idx == torch.arange( - seq_len.sum().item(), device=inputs_embeds.device)[None, :] + seq_len.sum().item(), device=image_features.device)[None, :] mat = mat.to(image_features.dtype) - inputs_embeds = inputs_embeds + torch.einsum('nd,nm->md', - image_features, mat) + # Note: In this original implementation from AI2, the final + # vision_embeddings will be always be the same length + # of input embedddings, which is not very efficient. + # TODO(ywang96): see if this can be optimized. + vision_embeddings = torch.einsum('nd,nm->md', image_features, mat) + return vision_embeddings + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = inputs_embeds + multimodal_embeddings return inputs_embeds def forward( @@ -1145,39 +1161,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> SamplerOutput: + if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - inputs_embeds = self.model.embed_tokens(input_ids) - image_features = self._process_image_input(image_input) - - inputs_embeds = self._merge_multimodal_embeddings( - inputs_embeds, - image_features, - image_input["image_input_idx"], - image_input["seq_len"], - ) - else: - inputs_embeds = self.model.embed_tokens(input_ids) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None - - hidden_states = self.model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
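The einsum scatter retained in the Molmo path above is easy to misread, so here is a toy version with made-up sizes (dtype and device handling from the real code are omitted). The one-hot matrix `mat` routes each image-feature row to its target sequence offset, and every other row stays zero, so adding the result to the text embeddings only touches the image positions.

import torch

d, seq_len = 4, 6
image_features = torch.arange(3 * d, dtype=torch.float32).view(3, d)   # (n, d)
image_input_idx = torch.tensor([1, 2, 5])   # target offsets in the sequence

# mat[n, m] == 1 exactly when feature row n belongs at sequence offset m.
mat = (image_input_idx[:, None] == torch.arange(seq_len)[None, :]).float()
vision_embeddings = torch.einsum('nd,nm->md', image_features, mat)      # (6, 4)

# Rows 1, 2 and 5 now carry the image features; rows 0, 3 and 4 are zero.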
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index dd5256eb87ab3..2e5b6bee784e7 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors @@ -240,36 +241,45 @@ def _process_image_input( return self.multi_modal_projector(image_features) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa + vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]: if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - parsed_image_input = self._parse_and_validate_image_input(**kwargs) - - if parsed_image_input is not None: - vision_embeddings = self._process_image_input( - parsed_image_input) - # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa - vision_embeddings = vision_embeddings * ( - self.config.hidden_size**-0.5) - - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 2e583bb08e87a..4cb874a13e0c1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -676,7 +676,7 @@ def _process_image_input( return image_embeds - def process_mm_inputs(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -686,12 +686,12 @@ def process_mm_inputs(self, **kwargs): def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.embed_tokens(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.image_token_id) return inputs_embeds @@ -703,12 +703,14 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): + if intermediate_tensors is not None: inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility elif inputs_embeds is None: - vision_embeddings = self.process_mm_inputs(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent + vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) input_ids = None diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0c2374c3c3fc9..a0605fee82aca 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -42,10 +42,12 @@ from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP +from .utils import merge_multimodal_embeddings logger = init_logger(__name__) @@ -371,6 +373,25 @@ def _process_audio_input(self, return masked_audio_features + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + audio_input = self._parse_and_validate_audio_input(**kwargs) + if audio_input is None: + return None + masked_audio_features = self._process_audio_input(audio_input) + return masked_audio_features + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.audio_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -378,33 +399,27 
@@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - audio_input = self._parse_and_validate_audio_input(**kwargs) - if audio_input is None: - inputs_embeds = None - else: - inputs_embeds = self.language_model.embed_tokens(input_ids) - masked_audio_features = self._process_audio_input(audio_input) - # merge llm embeddings and audio features - mask = (input_ids == self.config.audio_token_index) - inputs_embeds[mask, :] = masked_audio_features - - input_ids = None - - hidden_states = self.language_model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 531608a877f2f..7956a98b21569 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -63,7 +63,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, - MultiModalKwargs) + MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData @@ -1238,6 +1238,55 @@ def _merge_multimodal_embeddings( inputs_embeds[mask, :] = multimodal_embeddings return inputs_embeds + def get_multimodal_embeddings( + self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + if image_input is None and video_input is None: + return None + + # We make a tuple of each embedding with its modality string. This is a + # temporary workaround for models to handle mixed modalities when + # get_multimodal_embeddings and get_input_embeddings are called + # separately. + # TODO(ywang96): Add support for mixed-modality inference for v1. 
+ multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + + if image_input is not None: + image_embeds = self._process_image_input(image_input) + multimodal_embeddings.append((image_embeds, "image")) + if video_input is not None: + video_embeds = self._process_video_input(video_input) + multimodal_embeddings.append((video_embeds, "video")) + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[List[Tuple[NestedTensors, + str]]] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + for embeddings, modality in multimodal_embeddings: + if modality == "image": + inputs_embeds = self._merge_multimodal_embeddings( + input_ids, + inputs_embeds, + embeddings, + placeholder_token_id=self.config.image_token_id, + ) + if modality == "video": + inputs_embeds = self._merge_multimodal_embeddings( + input_ids, + inputs_embeds, + embeddings, + placeholder_token_id=self.config.video_token_id, + ) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -1245,6 +1294,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for Qwen2-VL. @@ -1266,42 +1316,26 @@ def forward( video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. `None` if no videos are passed. """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - - inputs_embeds = self.model.embed_tokens(input_ids) - - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - input_ids = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + + # We need to check for usage of mrope here in case there is + # multimodal data. + # TODO (ywang96): move this to model runner in V1. 
+ if multimodal_embeddings is not None and uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None hidden_states = self.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 512adbc7db35e..b61deccde45b7 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -449,10 +449,36 @@ def _process_audio_input( return result - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + audio_input = self._parse_and_validate_audio_input(**kwargs) + if audio_input is None: + return None + audio_embeddings = self._process_audio_input(audio_input) + return audio_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + + # TODO(ywang96): use merge_multimodal_embeddings after + # v0 is deprecated + merge_multimodal_embeddings_from_map( + inputs_embeds, multimodal_embeddings, + attn_metadata.multi_modal_placeholder_index_maps["audio"]) + return inputs_embeds + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[torch.Tensor], + intermediate_tensors: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for Ultravox @@ -466,30 +492,28 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, Args: audio_features: A batch of audio inputs [B, N, 80, M]. """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - audio_input = self._parse_and_validate_audio_input(**kwargs) - if audio_input is not None: - audio_embeddings = self._process_audio_input(audio_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - merge_multimodal_embeddings_from_map( - inputs_embeds, audio_embeddings, - attn_metadata.multi_modal_placeholder_index_maps["audio"]) - input_ids = None - else: - inputs_embeds = None - - hidden_states = self.language_model.model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + + # TODO(ywang96): remove attn_metadata from get_input_embeddings + # after v0 is deprecated + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings, + attn_metadata) + input_ids = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index dcfd2cb7d2622..4c13cbc953273 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -356,8 +356,7 @@ def embed_multimodal( input_ids: torch.Tensor, multimodal_token_id: int, get_text_embeds: Callable[[torch.Tensor], torch.Tensor], - get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor, - List[torch.Tensor]]], + multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]], ) -> torch.Tensor: """ Embed token IDs and multimodal inputs and combine their embeddings. @@ -374,8 +373,6 @@ def embed_multimodal( is_text = ~is_multimodal text_embeds = get_text_embeds(input_ids[is_text]) - multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal]) - merged_embeds = torch.empty( (input_ids.shape[0], text_embeds.shape[1]), dtype=text_embeds.dtype, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2e642c5869c97..8dbfb6ef3aaa4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -365,7 +365,8 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # 2. A list (length: num_images) of tensors, each of shape # [feature_size, hidden_size] in case when the feature size is # dynamic depending on input images. - encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + encoder_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) # Cache the encoder outputs. 
for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): From 95dd57876dd48c5027ba2150b3b289e1960744b5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 29 Nov 2024 02:45:14 +0000 Subject: [PATCH 065/293] tweak tolerance; fast check Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 9 +++++++++ tests/v1/samplers/test_logprobs.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fc23c9cff0d87..c6d31b837c55d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -174,6 +174,15 @@ steps: commands: - VLLM_USE_V1=1 pytest -v -s v1 +- label: V1 Fast Test + #mirror_hardwares: [amd] + fast_check: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - VLLM_USE_V1=1 pytest -v -s v1/samplers/test_logprobs.py::test_fast_get_logprobs_and_prompt_logprobs + - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 7c736d957e38a..a42e78da85ca0 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -234,8 +234,8 @@ def _test_case_get_logprobs_and_prompt_logprobs( torch.testing.assert_close( logprob.logprob, hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) + atol=2e-2, + rtol=2e-2) else: assert vllm_result.prompt_logprobs is None From dd8ea8b21ddad7818d43ddca3c700edf6107c1d0 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 26 Nov 2024 19:57:11 -0600 Subject: [PATCH 066/293] Remove hard-dependencies of Speculative decode to CUDA workers (#10587) Signed-off-by: Chendi Xue Signed-off-by: Andrew Feldman --- tests/spec_decode/test_spec_decode_worker.py | 4 +- vllm/config.py | 1 + .../layers/spec_decode_base_sampler.py | 17 +++++++- vllm/platforms/cpu.py | 8 +++- vllm/platforms/cuda.py | 4 +- vllm/spec_decode/draft_model_runner.py | 24 ++++++------ vllm/spec_decode/interfaces.py | 8 ++-- vllm/spec_decode/medusa_worker.py | 9 +++-- vllm/spec_decode/metrics.py | 15 ++++++- vllm/spec_decode/multi_step_worker.py | 31 +++++++++++---- vllm/spec_decode/ngram_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 36 +++++++++++------ vllm/spec_decode/target_model_runner.py | 33 ++++++---------- vllm/spec_decode/util.py | 12 ++++-- vllm/worker/cpu_model_runner.py | 39 ++++++++++++++++++- vllm/worker/cpu_worker.py | 27 ++++++++++++- vllm/worker/model_runner_base.py | 15 +++++++ vllm/worker/worker.py | 7 ++-- vllm/worker/worker_base.py | 3 ++ 19 files changed, 219 insertions(+), 77 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index d7caf57147278..caf7a7e625b46 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -595,8 +595,8 @@ def test_init_device(acceptance_sampler_method: str): target_worker.init_device.assert_called_once() - metrics_collector.init_gpu_tensors.assert_called_once() - spec_decode_sampler.init_gpu_tensors.assert_called_once() + metrics_collector.init_tensors.assert_called_once() + spec_decode_sampler.init_tensors.assert_called_once() @pytest.mark.parametrize("acceptance_sampler_method", diff --git a/vllm/config.py b/vllm/config.py index eae6f909e3933..68f73bf4b4dc9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -990,6 +990,7 @@ class ParallelConfig: # the full name of the worker class to use. 
If "auto", the worker class # will be determined based on the platform. worker_cls: str = "auto" + sd_worker_cls: str = "auto" world_size: int = field(init=False) diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 7e750a744e25f..6aa4b8bd34cde 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -43,6 +43,21 @@ def init_gpu_tensors(self, device: Union[int, str]) -> None: dtype=torch.long, device=device) + def init_tensors(self, + device: Union[int, str], + device_type: Union[torch.device, str] = 'cuda') -> None: + assert self.num_accepted_tokens is None + if isinstance(device_type, torch.device): + device_type = device_type.type + if isinstance(device, int): + device = f"{device_type}:{device}" + self.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + self.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + @property def probs_dtype(self): return torch.float32 @@ -77,7 +92,7 @@ def _create_output( tensor is [batch_size, k + num_bonus_tokens] """ batch_size, k = substitute_token_ids.shape - bonus_token_ids = bonus_token_ids.squeeze() + bonus_token_ids = bonus_token_ids.squeeze(-1) # Determine the index of the first False value for each row. limits = (accepted == 0).max(1).indices limits[~(accepted == 0).any(1)] = k diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cbc982752c6b4..3e22c87f61fac 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -86,4 +86,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "mp" if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" + if vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.cpu_worker.CPUWorker" + else: + parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0d07050fd1b6a..5e9ce551f2332 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -106,6 +106,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: elif vllm_config.speculative_config: parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" @@ -236,4 +238,4 @@ def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: if not isinstance(pynvml, _MockModule): CudaPlatform.log_warnings() except ModuleNotFoundError: - CudaPlatform.log_warnings() + CudaPlatform.log_warnings() \ No newline at end of file diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index cf166e3eb5bad..fe5fd39f42ac9 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -20,8 +20,9 @@ from vllm.logger import init_logger from vllm.multimodal import MultiModalKwargs from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, - ModelRunner) +from vllm.worker.model_runner_base import (ModelRunnerBase, + ModelRunnerInputBase, + ModelRunnerWrapperBase) logger = 
init_logger(__name__) @@ -33,7 +34,7 @@ allow_gpu_advance_step = True -class TP1DraftModelRunner(ModelRunner): +class TP1DraftModelRunner(ModelRunnerWrapperBase): """Specialized model runner for speculative decoding draft model. Since the draft model always execute k forward passes consecutively to generate k speculative tokens in a single speculative decoding step, @@ -46,13 +47,14 @@ class TP1DraftModelRunner(ModelRunner): any broadcasting inside execute_model). """ - def __init__(self, *args, **kwargs): - if kwargs.get("return_hidden_states"): + def __init__(self, model_runner: ModelRunnerBase): + if hasattr( + model_runner, + "return_hidden_states") and model_runner.return_hidden_states: raise ValueError( "return_hidden_states is not supported for TP1DraftModelRunner." ) - - super().__init__(*args, **kwargs) + super().__init__(model_runner) self.indices_of_seq_with_bonus_tokens = None @@ -73,10 +75,8 @@ def _update_sampling_metadata(self, sampling_metadata, num_seqs, assert seq_group.prompt_logprob_indices == [] # No prompt assert seq_group.sample_indices == [i] # Simple - def _gpu_advance_step( - self, model_input: ModelInputForGPUWithSamplingMetadata, - last_output: SamplerOutput - ) -> ModelInputForGPUWithSamplingMetadata: + def _gpu_advance_step(self, model_input: ModelRunnerInputBase, + last_output: SamplerOutput) -> ModelRunnerInputBase: # Currently, we expect "decode mode" only assert not model_input.is_prompt @@ -168,7 +168,7 @@ def set_indices_of_seq_with_bonus_tokens(self, @torch.inference_mode() def execute_model( self, - model_input: ModelInputForGPUWithSamplingMetadata, + model_input: ModelRunnerInputBase, kv_caches: List[torch.Tensor], previous_hidden_states: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 029f56460f5c1..a4fe0f13c8db1 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set +from typing import Optional, Set, Union import torch @@ -75,9 +75,11 @@ def get_spec_proposals( class SpeculativeScorer(ABC): - def __init__(self, scorer_worker: WorkerBase, device: str, - vocab_size: int): + def __init__(self, scorer_worker: WorkerBase, + device: Union[torch.device, str], vocab_size: int): self._scorer_worker = scorer_worker + if isinstance(device, torch.device): + device = device.type self._device = device self._vocab_size = vocab_size diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 0d233f393cb8c..1ab691a7ef047 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -9,21 +9,22 @@ from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker import Worker +from vllm.worker.worker_base import WorkerWrapperBase -class MedusaWorker(NonLLMProposerWorkerBase, Worker): +class MedusaWorker(NonLLMProposerWorkerBase, WorkerWrapperBase): """Worker for Medusa. """ def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(kwargs.get("vllm_config")) + self.init_worker(*args, **kwargs) # Lazy initialization list. 
self._proposer: Top1Proposer def init_device(self): - super().init_device() + self.worker.init_device() self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 89ccaba70e93c..03dc46600d8a9 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,11 +1,12 @@ import time -from typing import Callable, Optional +from typing import Callable, Optional, Union import msgspec import torch from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeBaseSampler) +from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -81,8 +82,20 @@ def init_gpu_tensors(self, rank: int) -> None: self._rank = rank self._copy_stream = torch.cuda.Stream() + def init_tensors(self, + rank: int, + device_type: Union[torch.device, str] = 'cuda') -> None: + self._rank = rank + if isinstance(device_type, torch.device): + device_type = device_type.type + if device_type == 'cuda': + self._copy_stream = torch.cuda.Stream() + def maybe_collect_rejsample_metrics( self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + # currently using cuda.Event, skip for any non_cuda_alike platform + if not current_platform.is_cuda_alike(): + return None # If a copy was initiated in the previous call, collect and return. if self._in_flight_copy is not None: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f49b98f5c9528..d249b37c780e4 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -5,17 +5,21 @@ import torch from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData, SequenceGroupMetadata) -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + +if current_platform.is_cuda_alike(): + from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker import Worker +from vllm.worker.worker_base import WorkerWrapperBase -class MultiStepWorker(Worker, ProposerWorkerBase): +class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase): """The MultiStepWorker is equivalent to a Worker except that it allows multiple forward passes in a single call, assuming the scheduler has allocated enough space to store the additional KV. This reduces overhead @@ -28,13 +32,14 @@ class MultiStepWorker(Worker, ProposerWorkerBase): """ def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(kwargs.get("vllm_config")) + self.init_worker(*args, **kwargs) # Lazy initialization list. 
self._proposer: SpeculativeProposer def init_device(self) -> None: - super().init_device() + self.worker.init_device() self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] @@ -51,6 +56,18 @@ def set_should_modify_greedy_probs_inplace(self) -> None: self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( True) + def determine_num_available_blocks(self) -> Tuple[int, int]: + return self.worker.determine_num_available_blocks() + + def get_cache_block_size_bytes(self) -> int: + return self.worker.get_cache_block_size_bytes() + + def initialize_cache(self, *args, **kwargs) -> None: + self.worker.initialize_cache(*args, **kwargs) + + def execute_model(self, *args, **kwargs) -> List[SamplerOutput]: + return self.worker.execute_model(*args, **kwargs) + @torch.inference_mode() def sampler_output( self, @@ -75,7 +92,7 @@ def sampler_output( # Run model sample_len times. model_outputs: List[SamplerOutput] = [] - if isinstance( + if current_platform.is_cuda_alike() and isinstance( self.model_runner, TP1DraftModelRunner ) and self.model_runner.supports_gpu_multi_step(expanded_request): # Here we run the draft_model_runner with multi-step prepare @@ -92,7 +109,7 @@ def sampler_output( # and other restrictions that are part of DraftModelRunner's # supports_gpu_multi_step(..) for _ in range(sample_len): - model_output: List[SamplerOutput] = super().execute_model( + model_output: List[SamplerOutput] = self.worker.execute_model( execute_model_req=expanded_request) assert (len(model_output) == 1 ), "composing multistep workers not supported" diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index debb3b2d5ec30..bb6b99135580e 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -22,6 +22,7 @@ def __init__(self, *args, **kwargs): # Get local_rank/vocab_size from kwargs attribute self.local_rank = kwargs["local_rank"] self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size() + self.device_type = kwargs.get("device_type", "cuda") # Lazy initialization list. 
self._proposer: Top1Proposer @@ -34,7 +35,7 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, self.ngram_prompt_lookup_min = ngram_prompt_lookup_min def init_device(self): - self.device = torch.device(f"cuda:{self.local_rank}") + self.device = torch.device(f"{self.device_type}:{self.local_rank}") self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b279931ca4b02..53634f7b0b366 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -14,12 +14,16 @@ SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) from vllm.model_executor.layers.typical_acceptance_sampler import ( TypicalAcceptanceSampler) +from vllm.platforms import current_platform from vllm.sequence import (VLLM_INVALID_TOKEN_ID, CompletionSequenceGroupOutput, ExecuteModelRequest, HiddenStates, SequenceGroupMetadata, get_all_seq_ids_and_request_ids) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + +if current_platform.is_cuda_alike(): + from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.medusa_worker import MedusaWorker @@ -36,8 +40,8 @@ get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) -from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase +from vllm.worker.worker_base import (LoraNotSupportedWorkerBase, WorkerBase, + WorkerWrapperBase) logger = init_logger(__name__) @@ -53,7 +57,11 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": draft_worker_kwargs = kwargs.copy() kwargs["model_runner_cls"] = TargetModelRunner - target_worker = Worker(*args, **kwargs) + target_worker_config = copy.deepcopy(vllm_config) + target_worker_config.parallel_config.worker_cls =\ + target_worker_config.parallel_config.sd_worker_cls + target_worker = WorkerWrapperBase(vllm_config=target_worker_config) + target_worker.init_worker(*args, **kwargs) # Set the disable_logprobs variable in the TargetModelRunner instance # as per its value specified in the SpeculativeConfig. target_worker.model_runner.disable_logprobs =\ @@ -65,6 +73,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": draft_worker_config.model_config, vllm_config.load_config, ) + speculative_config.draft_parallel_config.worker_cls =\ + draft_worker_config.parallel_config.sd_worker_cls draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa # TODO allow draft-model specific load config. 
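The refactor in this patch swaps direct Worker/ModelRunner construction for thin wrappers (WorkerWrapperBase for workers, ModelRunnerWrapperBase for model runners) that build the platform-specific object lazily and forward everything else to it. As a rough sketch only, with placeholder names rather than the actual vLLM definitions, the delegation idiom the patch relies on looks like this:

class _LazyWrapperSketch:
    """Owns a wrapped object and forwards unknown attribute lookups to it."""

    def __init__(self, inner) -> None:
        # 'inner' stands in for the platform-specific worker or model runner.
        self._inner = inner

    def __getattr__(self, attr):
        # __getattr__ only runs when normal attribute lookup fails, so any
        # method defined on the wrapper itself still takes precedence over
        # the wrapped object's version.
        return getattr(self._inner, attr)

This is why create_spec_worker, MultiStepWorker, and MedusaWorker can instantiate the wrapper, call init_worker(...) or a few methods explicitly, and still expose the rest of the wrapped worker's API unchanged.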
@@ -125,7 +135,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: Worker, + scorer_worker: WorkerBase, draft_worker_kwargs: Dict[str, Any], disable_mqa_scorer: bool, disable_by_batch_size: Optional[int], @@ -145,6 +155,8 @@ def create_worker( draft_parallel_config: ParallelConfig = draft_worker_kwargs[ 'vllm_config'].parallel_config if ngram_prompt_lookup_max > 0: + draft_worker_kwargs[ + "device_type"] = scorer_worker.device_config.device.type proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) @@ -158,8 +170,9 @@ def create_worker( proposer_worker = MedusaWorker(**draft_worker_kwargs) else: if draft_tp == 1: - draft_worker_kwargs[ - "model_runner_cls"] = TP1DraftModelRunner + if current_platform.is_cuda_alike(): + draft_worker_kwargs[ + "model_runner_cls"] = TP1DraftModelRunner else: if draft_model_config.hf_config.model_type == "eagle": raise NotImplementedError( @@ -306,8 +319,9 @@ def init_device(self) -> None: self.scorer_worker.load_model() self.proposer_worker.load_model() - self._metrics.init_gpu_tensors(self.rank) - self.spec_decode_sampler.init_gpu_tensors(self.rank) + self._metrics.init_tensors(self.rank, device_type=self.device) + self.spec_decode_sampler.init_tensors(self.rank, + device_type=self.device) scorer_cls: Type[SpeculativeScorer] if self.disable_mqa_scorer: @@ -1111,11 +1125,11 @@ def get_cache_block_size_bytes(self): raise NotImplementedError def start_profile(self): - if isinstance(self.scorer_worker, Worker): + if isinstance(self.scorer_worker, WorkerBase): self.scorer_worker.start_profile() def stop_profile(self): - if isinstance(self.scorer_worker, Worker): + if isinstance(self.scorer_worker, WorkerBase): self.scorer_worker.stop_profile() diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index e61cde5b17f20..56540744b73a9 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,12 +1,12 @@ from typing import List, Optional -from vllm.config import VllmConfig from vllm.sequence import SequenceGroupMetadata -from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, - ModelRunner) +from vllm.worker.model_runner_base import (ModelRunnerBase, + ModelRunnerInputBase, + ModelRunnerWrapperBase) -class TargetModelRunner(ModelRunner): +class TargetModelRunner(ModelRunnerWrapperBase): """Specialized model runner for speculative decoding target model. In speculative decoding, the log probabilities selected finally may not be the same ones as selected by the target model sampling. This means @@ -18,32 +18,21 @@ class TargetModelRunner(ModelRunner): requested or not. """ - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - ): + def __init__(self, model_runner: ModelRunnerBase): # An internal boolean member variable to indicate if token log # probabilities are needed or not. 
+ super().__init__(model_runner) self.disable_logprobs = True - super().__init__( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - return_hidden_states=return_hidden_states, - ) def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForGPUWithSamplingMetadata: - model_input: ModelInputForGPUWithSamplingMetadata = super( - ).prepare_model_input(seq_group_metadata_list, virtual_engine, - finished_requests_ids) + finished_requests_ids: Optional[List[str]] = None, + ) -> ModelRunnerInputBase: + model_input: ModelRunnerInputBase =\ + self.model_runner.prepare_model_input( + seq_group_metadata_list, virtual_engine, finished_requests_ids) # If token log probabilities is disabled then skip generating sampler # CPU output. We directly serialize the GPU sampled_token_id tensors # as needed. If log probabilities is enabled then synchronize all the diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 193ef870dfceb..da8706658d09a 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -5,6 +5,7 @@ import torch from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SequenceGroupMetadata, SequenceOutput) @@ -247,11 +248,14 @@ def nvtx_range(msg, *args, **kwargs): Arguments: msg (string): message to associate with the range """ - torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) - try: + if current_platform.is_cuda_alike(): + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() + else: yield - finally: - torch.cuda.nvtx.range_pop() class Timer: diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index b08171d79f002..420aaf8a1b4cd 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -80,6 +80,7 @@ class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU): Used by the ModelRunner. 
""" sampling_metadata: Optional["SamplingMetadata"] = None + is_prompt: Optional[bool] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -395,6 +396,7 @@ def __init__( vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, + return_hidden_states: bool = False, *args, **kwargs, ): @@ -403,19 +405,25 @@ def __init__( cache_config = self.cache_config self.is_driver_worker = is_driver_worker + self.return_hidden_states = return_hidden_states self.device = self.device_config.device + self.pin_memory = False self.kv_cache_dtype = kv_cache_dtype self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) + needs_attn_backend = (num_attn_heads != 0 + or self.model_config.is_attention_free) self.attn_backend = get_attn_backend( self.model_config.get_head_size(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, self.model_config.is_attention_free, - ) + ) if needs_attn_backend else None # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY @@ -444,6 +452,15 @@ def _prepare_model_input_tensors( return builder.build() # type: ignore + # sampler property will be used by spec_decode_worker + @property + def sampler(self): + return self.model.sampler + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( @@ -480,9 +497,12 @@ def prepare_model_input( pin_memory=False, generators=generators) + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) return dataclasses.replace(model_input, sampling_metadata=sampling_metadata, - virtual_engine=virtual_engine) + virtual_engine=virtual_engine, + is_prompt=is_prompt) @torch.no_grad() def execute_model( @@ -491,16 +511,22 @@ def execute_model( kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, + previous_hidden_states: Optional[torch.Tensor] = None, ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") model_executable = self.model + multimodal_kwargs = {} if model_input.multi_modal_kwargs is not None: multimodal_kwargs = MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs, device=self.device) + execute_model_kwargs = {} + if previous_hidden_states is not None: + execute_model_kwargs.update( + {"previous_hidden_states": previous_hidden_states}) with set_forward_context(model_input.attn_metadata, self.vllm_config): hidden_states = model_executable( @@ -509,6 +535,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, + **execute_model_kwargs, **multimodal_kwargs, ) @@ -525,4 +552,12 @@ def execute_model( logits=logits, sampling_metadata=model_input.sampling_metadata, ) + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + if model_input.is_prompt: + output.prefill_hidden_states = hidden_states + output.hidden_states = hidden_states return [output] + + def generate_proposals(self, *args, **kwargs): + return self.model.generate_proposals(*args, **kwargs) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bc9164bd9d5df..cf04808b73372 100644 --- 
a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -128,6 +128,7 @@ def __init__( distributed_init_method: str, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, + model_runner_cls: Optional[Type[CPUModelRunner]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) @@ -151,6 +152,16 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type + not in ["medusa", "mlp_speculator", "eagle"]) \ + else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.task == "embedding": ModelRunnerClass = CPUEmbeddingModelRunner @@ -159,7 +170,11 @@ def __init__( self.model_runner: CPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker) + is_driver_worker=is_driver_worker, + **speculative_args, + ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] @@ -197,7 +212,7 @@ def init_device(self) -> None: ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) if ret: logger.info(ret) - + self.device = torch.device("cpu") self.init_distributed_environment() # Set random seed. set_random_seed(self.model_config.seed) @@ -297,6 +312,14 @@ def do_metadata_broadcast(self) -> bool: def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: return self.cpu_cache + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + def execute_worker( self, worker_input: WorkerInput, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 9e529f86b46bb..cd4770202a186 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -289,3 +289,18 @@ def get_generators(self, finished_request_ids: Optional[List[str]] = None): self.generators.pop(request_id, None) return self.generators + + +class ModelRunnerWrapperBase: + """ + The whole point of this class is to lazily initialize the model_runner. 
+ """ + + def __init__( + self, + moderl_runner: ModelRunnerBase, + ) -> None: + self.model_runner: ModelRunnerBase = moderl_runner + + def __getattr__(self, attr): + return getattr(self.model_runner, attr) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 80fd7bc3b67cc..24e7bc760b0c0 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -74,9 +74,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_runner_cls is not None: - ModelRunnerClass = model_runner_cls - elif model_config.task == "embedding": + if model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner @@ -86,6 +84,9 @@ def __init__( is_driver_worker=is_driver_worker, **speculative_args, ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) + # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e7fec6d17eecd..7aaa8b453cff1 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -466,6 +466,9 @@ def execute_method(self, method, *args, **kwargs): logger.exception(msg) raise e + def __getattr__(self, attr): + return getattr(self.worker, attr) + def extract_previous_hidden_states( data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ From d41446435720bcbd512af3943a1d4b3365db7f77 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 26 Nov 2024 18:04:01 -0800 Subject: [PATCH 067/293] [V1] Update interface for idefics3 (#10680) Signed-off-by: Roger Wang Signed-off-by: Andrew Feldman --- vllm/model_executor/models/idefics3.py | 73 ++++++++++++++++---------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 5d176b2a4e416..58f7635275c05 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -39,6 +39,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of @@ -597,6 +598,12 @@ def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: image_features = self._process_image_pixels(image_input) return self.connector(image_features) + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.text_model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -604,26 +611,8 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - **kwargs: object, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - if intermediate_tensors is not None: - input_ids = None - inputs_embeds = None - else: - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = 
self._process_image_input(image_input) - inputs_embeds = self.text_model.get_input_embeddings(input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - else: - inputs_embeds = self.text_model.get_input_embeddings(input_ids) - input_ids = None hidden_states = self.text_model( input_ids, @@ -718,6 +707,25 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.text_config.vocab_size) self.sampler = Sampler() + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self.model._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self.model._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -725,16 +733,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors, - **kwargs, - ) + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.model.text_model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) + return hidden_states def compute_logits(self, hidden_states: torch.Tensor, From 0f196ac7f9e0ef4e2ecf039feef4721ce6fc22bf Mon Sep 17 00:00:00 2001 From: jeongin601 <78595701+jeongin601@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:07:30 +0900 Subject: [PATCH 068/293] [Bugfix][SpecDecode] apply sampling parameters to target probabilities for consistency in rejection sampling. 
(#10198) Signed-off-by: jeongin601 <0200angela@gmail.com> Signed-off-by: jeong_in.bae Signed-off-by: Andrew Feldman --- tests/spec_decode/e2e/test_mlp_correctness.py | 2 +- tests/spec_decode/test_batch_expansion.py | 8 ++++++++ vllm/spec_decode/batch_expansion.py | 14 +------------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 5ecc0d4e95719..183ff2f5db274 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -203,7 +203,7 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("temperature", [0.1, 1.0]) +@pytest.mark.parametrize("temperature", [1.0]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 0d6aaa449d856..3504fcf43e361 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -90,6 +90,14 @@ def test_create_single_target_seq_group_metadata(k: int): ) assert output.request_id == input_seq_group_metadata.request_id + assert output.sampling_params.repetition_penalty == \ + input_seq_group_metadata.sampling_params.repetition_penalty + assert output.sampling_params.temperature == \ + input_seq_group_metadata.sampling_params.temperature + assert output.sampling_params.top_p == \ + input_seq_group_metadata.sampling_params.top_p + assert output.sampling_params.top_k == \ + input_seq_group_metadata.sampling_params.top_k assert len(output.seq_data) == 1 assert output.seq_data[target_seq_id].get_prompt_token_ids() == tuple( prompt_tokens) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 25ef27b8378f0..01b9cdad963da 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -307,28 +307,16 @@ def _create_target_seq_group_metadata( token_ids_to_score = self._get_token_ids_to_score( proposal_token_ids[batch_index]) - # Use simpler sampling parameters apart from for final token - # (in particular don't do seeded sampling) since those sampled tokens - # aren't used. - # We don't replace the sampling_params in the greedy case because - # this also controls whether the probs get modified in the sampler - # (see use of _modify_greedy_probs_inplace there). 
sampling_params = input_seq_group_metadata.sampling_params - non_bonus_sampling_params = DEFAULT_SIMPLE_SAMPLING_PARAMS \ - if sampling_params.temperature else sampling_params - target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] - last_index = len(token_ids_to_score) - 1 for i, token_ids in enumerate(token_ids_to_score): - target_sampling_params = sampling_params if i == last_index \ - else non_bonus_sampling_params target_seq_group_metadata_list.append( self._create_single_target_seq_group_metadata( input_seq_group_metadata, input_seq_id, next(target_seq_ids_iter), token_ids, - sampling_params=target_sampling_params, + sampling_params=sampling_params, )) return target_seq_group_metadata_list From 429d17e428932083e739ad51e3f49661fd38ff9c Mon Sep 17 00:00:00 2001 From: yansh97 Date: Wed, 27 Nov 2024 13:55:23 +0800 Subject: [PATCH 069/293] [bugfix] fix the default value of llm_int8_threshold in BitsAndBytesConfig (#10657) Signed-off-by: Andrew Feldman --- vllm/model_executor/layers/quantization/bitsandbytes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 6a0de3034142a..e01c713dd14db 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -26,7 +26,7 @@ def __init__( llm_int8_enable_fp32_cpu_offload: bool = False, llm_int8_has_fp16_weight: bool = False, llm_int8_skip_modules: Optional[List[str]] = None, - llm_int8_threshold: float = 0.0, + llm_int8_threshold: float = 6.0, ) -> None: self.load_in_8bit = load_in_8bit @@ -103,7 +103,7 @@ def get_safe_value(config, keys, default_value=None): ["llm_int8_skip_modules"], default_value=[]) llm_int8_threshold = get_safe_value(config, ["llm_int8_threshold"], - default_value=0.0) + default_value=6.0) return cls( load_in_8bit=load_in_8bit, From 89c4f78c59dd6c2777329c6f3462b2e45c724337 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 27 Nov 2024 14:49:40 +0800 Subject: [PATCH 070/293] [Hardware][Gaudi]add get_name method for HPUAttentionBackend (#10667) Signed-off-by: Kunshang Ji Signed-off-by: Andrew Feldman --- vllm/attention/backends/hpu_attn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 4a3ddd5db94e5..5359941d41fde 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -22,6 +22,10 @@ class HPUAttentionBackend(AttentionBackend): + @staticmethod + def get_name() -> str: + return "HPU_ATTN" + @staticmethod def get_impl_cls() -> Type["HPUAttentionImpl"]: return HPUAttentionImpl From a809ee1494538f6417ee13fb1d3e336a042d11cf Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 27 Nov 2024 14:54:12 +0800 Subject: [PATCH 071/293] [Misc]Further reduce BNB static variable (#10597) Signed-off-by: Jee Jee Li Signed-off-by: Andrew Feldman --- vllm/model_executor/model_loader/loader.py | 218 ++++++++++++--------- vllm/model_executor/models/baichuan.py | 8 - vllm/model_executor/models/falcon.py | 6 - vllm/model_executor/models/gemma.py | 9 - vllm/model_executor/models/gemma2.py | 9 - vllm/model_executor/models/idefics3.py | 15 -- vllm/model_executor/models/llama.py | 9 - vllm/model_executor/models/minicpmv.py | 34 ---- vllm/model_executor/models/mllama.py | 14 -- vllm/model_executor/models/opt.py | 3 - vllm/model_executor/models/phi.py | 3 - vllm/model_executor/models/phi3.py | 6 - 
vllm/model_executor/models/qwen.py | 7 +- vllm/model_executor/models/qwen2.py | 9 - 14 files changed, 131 insertions(+), 219 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 441dd409b4f9d..37c2d789030b6 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -28,7 +28,8 @@ get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (LinearBase, + MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) @@ -78,12 +79,14 @@ def device_loading_context(module: torch.nn.Module, original_device: torch.device = original_device_states[name] if original_device.type == "cpu": # `torch.empty_like` does not support `pin_memory` argument - cpu_data = torch.empty_strided(size=p.data.size(), - stride=p.data.stride(), - dtype=p.data.dtype, - layout=p.data.layout, - device="cpu", - pin_memory=pin_memory) + cpu_data = torch.empty_strided( + size=p.data.size(), + stride=p.data.stride(), + dtype=p.data.dtype, + layout=p.data.layout, + device="cpu", + pin_memory=pin_memory, + ) cpu_data.copy_(p.data) p.data = cpu_data else: @@ -112,7 +115,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: logger.warning(msg) logger.warning( "Trying to guess the arguments for old-style model class %s", - model_class) + model_class, + ) # try to be compatible with old-style model class kwargs = {} if "prefix" in all_params: @@ -198,14 +202,17 @@ def _maybe_download_from_modelscope( return model_path return None - def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str], - fall_back_to_pt: bool) -> Tuple[str, List[str], bool]: + def _prepare_weights( + self, + model_name_or_path: str, + revision: Optional[str], + fall_back_to_pt: bool, + ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" - model_name_or_path = self._maybe_download_from_modelscope( - model_name_or_path, revision) or model_name_or_path + model_name_or_path = (self._maybe_download_from_modelscope( + model_name_or_path, revision) or model_name_or_path) is_local = os.path.isdir(model_name_or_path) load_format = self.load_config.load_format @@ -258,8 +265,11 @@ def _prepare_weights(self, model_name_or_path: str, # any files not found in the index. 
if not is_local: download_safetensors_index_file_from_hf( - model_name_or_path, index_file, - self.load_config.download_dir, revision) + model_name_or_path, + index_file, + self.load_config.download_dir, + revision, + ) hf_weights_files = filter_duplicate_safetensors_files( hf_weights_files, hf_folder, index_file) else: @@ -282,8 +292,11 @@ def _get_weights_iterator( # Currently np_cache only support *.bin checkpoints assert use_safetensors is False weights_iterator = np_cache_weights_iterator( - source.model_or_path, self.load_config.download_dir, hf_folder, - hf_weights_files) + source.model_or_path, + self.load_config.download_dir, + hf_folder, + hf_weights_files, + ) elif use_safetensors: weights_iterator = safetensors_weights_iterator(hf_weights_files) else: @@ -310,17 +323,19 @@ def _get_all_weights( model_config: ModelConfig, model: nn.Module, ) -> Generator[Tuple[str, torch.Tensor], None, None]: - primary_weights = DefaultModelLoader.Source( model_config.model, model_config.revision, prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", - True)) + True), + ) yield from self._get_weights_iterator(primary_weights) - secondary_weights = cast(Iterable[DefaultModelLoader.Source], - getattr(model, "secondary_weights", ())) + secondary_weights = cast( + Iterable[DefaultModelLoader.Source], + getattr(model, "secondary_weights", ()), + ) for source in secondary_weights: yield from self._get_weights_iterator(source) @@ -416,7 +431,7 @@ def _verify_config(self, model_config: ModelConfig, self.tensorizer_config.verify_with_parallel_config(parallel_config) def _get_weights_iterator( - self) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, ) -> Generator[Tuple[str, torch.Tensor], None, None]: tensorizer_args = self.tensorizer_config._construct_tensorizer_args() return tensorizer_weights_iterator(tensorizer_args) @@ -479,9 +494,10 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: if parallel_config.tensor_parallel_size > 1: from vllm.distributed import get_tensor_model_parallel_rank - self.tensorizer_config.tensorizer_uri = \ - self.tensorizer_config.tensorizer_uri \ - % get_tensor_model_parallel_rank() + + self.tensorizer_config.tensorizer_uri = ( + self.tensorizer_config.tensorizer_uri % + get_tensor_model_parallel_rank()) if is_vllm_tensorized(self.tensorizer_config): return self._load_model_serialized(vllm_config=vllm_config) @@ -520,13 +536,13 @@ def __init__(self, load_config: LoadConfig): @staticmethod def _filter_subtensors( - tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]: """ Filter out all tensors that share the same memory or a subset of the memory of another tensor. 
""" - same_storage_groups: Dict[Any, List[Tuple[ - str, torch.Tensor]]] = collections.defaultdict(list) + same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = ( + collections.defaultdict(list)) for key, tensor in tensors.items(): if tensor.numel(): ptr = tensor.untyped_storage().data_ptr() @@ -615,8 +631,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: if tensor.shape != param_shape: logger.warning( "loading tensor of shape %s into " - "parameter '%s' of shape %s", tensor.shape, - key, param_shape) + "parameter '%s' of shape %s", + tensor.shape, + key, + param_shape, + ) param_data.copy_(tensor) state_dict.pop(key) if state_dict: @@ -634,6 +653,7 @@ def save_model( from safetensors.torch import save_file from vllm.distributed import get_tensor_model_parallel_rank + if pattern is None: pattern = ShardedStateLoader.DEFAULT_PATTERN rank = get_tensor_model_parallel_rank() @@ -667,24 +687,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): possible_config_file_names = ["adapter_config.json"] - default_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - '.fc1.', - '.fc2.', - '.dense.', - '.query_key_value.', - '.qkv_proj.', - '.dense_h_to_4h.', - '.dense_4h_to_h.', - '.out_proj.', - ] - def __init__(self, load_config: LoadConfig): super().__init__(load_config) @@ -709,6 +711,11 @@ def __init__(self, load_config: LoadConfig): with open(config_file_path) as f: config = json.load(f) self.target_modules = config["target_modules"] + # TODO: target_modules could be either a list or a regex string. + # We need to handle both cases. + assert isinstance(self.target_modules, + list), "Unsupported target_modules: " + f"{self.target_modules}" def _get_config_file(self, qlora_adapter: str) -> str: is_local = os.path.isdir(qlora_adapter) @@ -734,12 +741,13 @@ def _get_config_file(self, qlora_adapter: str) -> str: return config_file_path def _get_weight_files( - self, - model_name_or_path: str, - allowed_patterns: List[str], - revision: Optional[str] = None) -> Tuple[List[str], str]: - """Retrieve weight files. Download the files if necessary. - + self, + model_name_or_path: str, + allowed_patterns: List[str], + revision: Optional[str] = None, + ) -> Tuple[List[str], str]: + """Retrieve weight files. Download the files if necessary. + Return the weight files and the file pattern.""" is_local = os.path.isdir(model_name_or_path) @@ -806,6 +814,7 @@ def _get_quantized_weights_iterator( # only load the bitsandbytes module when needed try: import bitsandbytes + if bitsandbytes.__version__ < "0.44.0": raise ImportError("bitsandbytes version is wrong. 
Please " "install bitsandbytes>=0.44.0.") @@ -839,8 +848,11 @@ def _is_8bit_weight_name(self, weight_name: str): def _is_4bit_weight_name(self, weight_name: str): quantized_suffix = { - "absmax", "quant_map", "nested_absmax", "nested_quant_map", - "bitsandbytes" + "absmax", + "quant_map", + "nested_absmax", + "nested_quant_map", + "bitsandbytes", } suffix = weight_name.split(".")[-1] return any(q_suffix in suffix for q_suffix in quantized_suffix) @@ -857,7 +869,6 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, for weight_name, weight_tensor in self._hf_weight_iter( hf_weights_files, use_safetensors): - if self._is_8bit_weight_name(weight_name): continue @@ -899,14 +910,13 @@ def _parse_quant_state(param_name: str, # pre quantized weights would have a quant_state for weight_name, weight_tensor in self._hf_weight_iter( hf_weights_files, use_safetensors): - if self._is_4bit_weight_name(weight_name): continue - if (f"{weight_name}.quant_state.bitsandbytes__nf4" \ - in temp_state_dict) or \ - (f"{weight_name}.quant_state.bitsandbytes__fp4" \ - in temp_state_dict): + if (f"{weight_name}.quant_state.bitsandbytes__nf4" + in temp_state_dict) or ( + f"{weight_name}.quant_state.bitsandbytes__fp4" + in temp_state_dict): quant_state = _parse_quant_state(weight_name, temp_state_dict) quant_state_dict[weight_name] = quant_state yield weight_name, weight_tensor @@ -916,12 +926,12 @@ def _parse_quant_state(param_name: str, def _unquantized_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: from bitsandbytes.functional import quantize_4bit + tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for weight_name, weight_tensor in self._hf_weight_iter( hf_weights_files, use_safetensors): - if any(target_module in weight_name for target_module in self.target_modules) and weight_name.endswith(".weight"): # Without sharding @@ -954,12 +964,11 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, # get the start/end index of each shard weight tensor total_start_index = list( itertools.accumulate([0] + total_shard_sizes))[:-1] - shard_weights_index = [ - (idx + size // tp_size * tp_rank, - idx + size // tp_size * (tp_rank + 1)) - for idx, size in zip(total_start_index, - total_shard_sizes) - ] + shard_weights_index = [( + idx + size // tp_size * tp_rank, + idx + size // tp_size * (tp_rank + 1), + ) for idx, size in zip(total_start_index, + total_shard_sizes)] # slice and reorder the weight tensor weight_tensor = [ weight_tensor[start_index:end_index, ...] @@ -989,7 +998,8 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, processed_weight, quant_state = quantize_4bit( loaded_weight, compress_statistics=True, - quant_type="nf4") + quant_type="nf4", + ) quant_state_dict[weight_name] = quant_state else: @@ -997,28 +1007,58 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, yield weight_name, processed_weight + def _get_bnb_target_modules(self, model: nn.Module) -> None: + + # TODO: Maybe we can replace bitsandbytes_stacked_params_mapping with + # packed_modules_mapping. 
+ inverse_stacked_mapping: Dict[str, List[str]] = {} + for orig, ( + packed, + idx, + ) in model.bitsandbytes_stacked_params_mapping.items(): + if packed not in inverse_stacked_mapping: + inverse_stacked_mapping[packed] = [] + inverse_stacked_mapping[packed].insert(idx, orig) + + linear_module_lst = [] + for name, module in model.named_modules(): + if isinstance(module, (LinearBase, )): + last_name = name.split(".")[-1] + if sub_modules := inverse_stacked_mapping.get(last_name, []): + # Map vllm's names to transformers' names. + for sub_name in sub_modules: + linear_module_lst.append( + name.replace(last_name, sub_name)) + else: + linear_module_lst.append(name) + if self.target_modules: + # Update self.target_modules + self.target_modules = [ + qual_name for qual_name in linear_module_lst + if any(t in qual_name for t in self.target_modules) + ] + else: + self.target_modules = linear_module_lst + assert (self.target_modules + ), "vllm currently does not support BNB quantization for" + f" {type(model).__name__}" + def _load_weights(self, model_config: ModelConfig, model: nn.Module) -> None: - if not hasattr(model, 'load_weights'): + if not hasattr(model, "load_weights"): raise AttributeError( "The required method 'load_weights' is not defined in class" f" {type(model).__name__}.") - if not hasattr(model, 'bitsandbytes_stacked_params_mapping'): + if not hasattr(model, "bitsandbytes_stacked_params_mapping"): raise AttributeError( f"Model {type(model).__name__} does not support BitsAndBytes " "quantization yet.") - if len(self.target_modules) == 0: - if hasattr(model, 'default_bitsandbytes_target_modules'): - self.target_modules = model.default_bitsandbytes_target_modules - else: - self.target_modules = self.default_target_modules - # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP self.maybe_fused_weights_modules: Dict[str, List[int]] = {} - + self._get_bnb_target_modules(model) for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights # sharded. 
The reason for implementing it this way is to avoid new @@ -1046,7 +1086,7 @@ def _load_weights(self, model_config: ModelConfig, pre_quant = False if quant_config is not None: - quant_method = quant_config.get('quant_method') + quant_method = quant_config.get("quant_method") if quant_method == "bitsandbytes": pre_quant = True else: @@ -1063,11 +1103,12 @@ def _load_weights(self, model_config: ModelConfig, load_8bit = False if pre_quant: - load_8bit = quant_config.get('load_in_8bit', False) + load_8bit = quant_config.get("load_in_8bit", False) - qweight_iterator, quant_state_dict = \ - self._get_quantized_weights_iterator( - model_config.model, model_config.revision, pre_quant, load_8bit) + qweight_iterator, quant_state_dict = ( + self._get_quantized_weights_iterator(model_config.model, + model_config.revision, + pre_quant, load_8bit)) model.load_weights(qweight_iterator) @@ -1078,6 +1119,7 @@ def _load_weights(self, model_config: ModelConfig, # TODO: Change this lazy import to normal import # after the checks are updated to run on a new version from vllm.model_executor.models.utils import is_pp_missing_parameter + for quant_param_name in quant_state_dict: if is_pp_missing_parameter(quant_param_name, model): continue @@ -1086,9 +1128,9 @@ def _load_weights(self, model_config: ModelConfig, shard_index = 0 for shard_name, ( - weight_name, index + weight_name, + index, ) in model.bitsandbytes_stacked_params_mapping.items(): - shard_pos = quant_param_name.find(shard_name) # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' @@ -1123,8 +1165,8 @@ def _load_weights(self, model_config: ModelConfig, num_elements = [0] * len(quant_states) for seq, quant_state in quant_states.items(): - num_elements[seq] = math.prod( - quant_state.shape) // pack_ratio + num_elements[seq] = (math.prod(quant_state.shape) // + pack_ratio) offsets = np.concatenate(([0], np.cumsum(num_elements))) set_weight_attrs(param, {"bnb_shard_offsets": offsets}) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 39cb5a8b2cbbe..5e68b7f165bf4 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -351,14 +351,6 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".W_pack.", - ".o_proj.", - ".down_proj.", - ".up_proj.", - ".gate_proj.", - ".up_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "gate_proj": ("gate_up_proj", 0), diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 096ad32b38e86..8660cf79b9cdb 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -412,12 +412,6 @@ class FalconForCausalLM(nn.Module, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = {} - default_bitsandbytes_target_modules = [ - ".query_key_value.", - ".dense.", - ".dense_h_to_4h.", - ".dense_4h_to_h.", - ] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 131e9af139c2a..b28715c48adfb 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -350,15 +350,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "down_proj", ] # BitandBytes specific attributes 
- default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index d229eb74669ee..c93223c740272 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -386,15 +386,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 58f7635275c05..014e27bc869d4 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -656,21 +656,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - # vision_model - ".fc1.", - ".fc2.", - ".out_proj.", - # connector - ".proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 355b2f3ef8b28..7cc5547b4a4d5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -463,15 +463,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = ["lm_head"] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 99bf1d42d0355..aacce477e0460 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -822,25 +822,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - # vision encoder - ".fc1.", - ".fc2.", - # Currently, vllm does not support BNB quantization for the `out_proj` - # of the resampler, so it's necessary to distinguish between the - # vision encoder and the resampler's out_proj. The same applies to - # MiniCPMV2_6. 
- ".self_attn.out_proj.", # vision encoder out_proj - # resampler - ".kv_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -964,21 +945,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - # vision encoder - ".fc1.", - ".fc2.", - ".self_attn.out_proj.", - # resampler - ".kv_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 9e6634a9a7579..6536f9807730c 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1104,20 +1104,6 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_mllama) class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ".fc1.", - ".fc2.", - # The `multi_modal_projector` is at the top level of the model, - # so we can't add a dot in front of it. - "multi_modal_projector." - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index db85a494980a7..7edafcd20b5db 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -337,9 +337,6 @@ class OPTForCausalLM(nn.Module, SupportsPP): "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), } - default_bitsandbytes_target_modules = [ - ".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2." - ] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 998d3723a0d7d..f9e972688ddd1 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -286,9 +286,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), } - default_bitsandbytes_target_modules = [ - ".q_proj.", ".k_proj.", ".v_proj.", ".fc1.", ".fc2.", ".dense." - ] embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 54158bc141235..937858ee3b8c2 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -16,11 +16,5 @@ class Phi3ForCausalLM(LlamaForCausalLM): } # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_up_proj.", - ".down_proj.", - ".qkv_proj.", - ".o_proj.", - ] # Initialize an empty dict when there is no stacked parameter mapping. 
bitsandbytes_stacked_params_mapping = {} diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 8f001200308fe..63d1374ab4092 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1028,12 +1028,7 @@ class QWenLLM(QWenBaseModel): embedding_modules = {} embedding_padding_modules = [] - default_bitsandbytes_target_modules = [ - ".c_attn.", - ".c_proj.", - ".w1.", - ".w2.", - ] + # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "w2": ("gate_up_proj", 0), diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 46640226d4cf8..9f706610a129a 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -419,15 +419,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), From 57485ba105b64dd298f868d6b3a89b313fb74df5 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 27 Nov 2024 01:55:32 -0500 Subject: [PATCH 072/293] [Kernel] Remove if-else with identical branches in marlin 2:4 (#10687) Signed-off-by: Tyler Michael Smith Signed-off-by: Andrew Feldman --- .../marlin/sparse/marlin_24_cuda_kernel.cu | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index 8fce76eb52f9b..17837351324be 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -296,13 +296,9 @@ __global__ void Marlin_24( // We use a different scale layout for grouped and column-wise quantization as // we scale a `half2` tile in column-major layout in the former and in // row-major in the latter case. 
- if (group_blocks != -1) { - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - } else { - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - } + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; // Note that in the original Marlin kernel + // this is (threadIdx.x % 32) / 4 // Precompute which thread should not read memory in which iterations; this is // needed if there are more threads than required for a certain tilesize or From e2552622389da8b6af3fb82b9eb66a3d6ba2fd3d Mon Sep 17 00:00:00 2001 From: shunxing12345 <168084185+shunxing12345@users.noreply.github.com> Date: Wed, 27 Nov 2024 19:32:35 +0800 Subject: [PATCH 073/293] [Model] Support telechat2 (#10311) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: xiangw2 Co-authored-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 5 + tests/models/registry.py | 2 + vllm/model_executor/models/llama.py | 6 +- vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/telechat2.py | 131 +++++++++++++++++++ vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/telechat2.py | 61 +++++++++ 8 files changed, 210 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/models/telechat2.py create mode 100644 vllm/transformers_utils/configs/telechat2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index b5cbe6915d581..c5fbb30b24e28 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -309,6 +309,11 @@ Text Generation - :code:`upstage/solar-pro-preview-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`TeleChat2ForCausalLM` + - TeleChat2 + - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. + - ✅︎ + - ✅︎ * - :code:`XverseForCausalLM` - XVERSE - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. 
diff --git a/tests/models/registry.py b/tests/models/registry.py index 865e90b3f8b0e..a93bfe907e0d7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -115,6 +115,8 @@ class _HfExamplesInfo: "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", + trust_remote_code=True), "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", is_available_online=False, trust_remote_code=True), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 7cc5547b4a4d5..fffb3fe53b94c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -501,8 +501,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config - self.model = LlamaModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + self.model = self._init_model(vllm_config=vllm_config, prefix=prefix) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -539,6 +538,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=False) + def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): + return LlamaModel(vllm_config=vllm_config, prefix=prefix) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f5a02a5b25ca2..4462f6ed55a9c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -91,6 +91,7 @@ "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), + "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), @@ -118,6 +119,7 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 + "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py new file mode 100644 index 0000000000000..39c9103527f01 --- /dev/null +++ b/vllm/model_executor/models/telechat2.py @@ -0,0 +1,131 @@ +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Iterable, Set, Tuple
+
+import torch
+
+from vllm.config import VllmConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.llama import LlamaForCausalLM, LlamaModel
+
+from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
+                    is_pp_missing_parameter)
+
+
+class TeleChat2Model(LlamaModel):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        # 1. Initialize the LlamaModel with bias
+        vllm_config.model_config.hf_config.bias = True
+        vllm_config.model_config.hf_config.mlp_bias = True
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        # 2. Remove the bias from the qkv_proj and gate_up_proj based on config
+        # Telechat2's gate_up_proj and qkv_proj don't have bias
+        # see: https://github.com/vllm-project/vllm/pull/10311#issuecomment-2490297566
+        for layer in self.layers:
+            if not isinstance(layer, PPMissingLayer):
+                layer.self_attn.qkv_proj.bias = None
+                layer.self_attn.qkv_proj.skip_bias_add = True
+                layer.mlp.gate_up_proj.bias = None
+                layer.mlp.gate_up_proj.skip_bias_add = True
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            ('gate_up_proj', 'gate_proj', 0),
+            ('gate_up_proj', 'up_proj', 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        total_num_heads = self.config.n_head
+        head_dim = self.config.hidden_size // total_num_heads
+        for name, loaded_weight in weights:
+            if "self_attn.key_value" in name:
+                k_weight = []
+                v_weight = []
+                for i in range(total_num_heads):
+                    start = i * head_dim * 2
+                    k_weight.append(loaded_weight[start:start + head_dim, :])
+                    v_weight.append(loaded_weight[start + head_dim:start +
+                                                  2 * head_dim:])
+                k_weight = torch.cat(k_weight, dim=0)
+                v_weight = torch.cat(v_weight, dim=0)
+                name = name.replace("key_value", "qkv_proj")
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, k_weight, "k")
+                weight_loader(param, v_weight, "v")
+            elif "query" in name:
+                name = name.replace("query", "qkv_proj")
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, "q")
+            else:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                    break
+                else:
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class TeleChat2ForCausalLM(LlamaForCausalLM):
+
+    def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
+        return TeleChat2Model(vllm_config=vllm_config, prefix=prefix)
+
+    def load_weights(self,
weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "transformer.": "model.", + }, + orig_to_new_substr={ + ".h.": ".layers.", + ".self_attention.": ".self_attn.", + ".word_embeddings.": ".embed_tokens.", + ".dense.": ".o_proj.", + ".ln_f.": ".norm.", + }, + ) + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4c096acdf2035..3da99bcbee9ae 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -29,7 +29,8 @@ MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, Olmo2Config, RWConfig, - SolarConfig, UltravoxConfig) + SolarConfig, Telechat2Config, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -64,6 +65,7 @@ "NVLM_D": NVLM_D_Config, "olmo2": Olmo2Config, "solar": SolarConfig, + "telechat": Telechat2Config, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 4c721001d8434..c24433cd436b4 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -17,6 +17,7 @@ from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.olmo2 import Olmo2Config from vllm.transformers_utils.configs.solar import SolarConfig +from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ @@ -36,5 +37,6 @@ "NVLM_D_Config", "Olmo2Config", "SolarConfig", + "Telechat2Config", "UltravoxConfig", ] \ No newline at end of file diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py new file mode 100644 index 0000000000000..eb6f5a059169f --- /dev/null +++ b/vllm/transformers_utils/configs/telechat2.py @@ -0,0 +1,61 @@ +# adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py +""" Telechat configuration compatible with LlamaConfig. 
""" + +from transformers.configuration_utils import PretrainedConfig + + +class Telechat2Config(PretrainedConfig): + + model_type = "telechat" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "intermediate_size": "ffn_hidden_size", + "rms_norm_eps": "layer_norm_epsilon" + } + + def __init__( + self, + vocab_size=160256, + hidden_size=4096, + n_layer=30, + n_head=32, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + ffn_hidden_size=12288, + training_seqlen=8192, + logn=True, + embed_layernorm=False, + hidden_act="silu", + **kwargs, + ): + self.vocab_size = vocab_size + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.apply_residual_connection_post_layernorm = ( + apply_residual_connection_post_layernorm) + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.logn = logn + self.training_seqlen = training_seqlen + self.embed_layernorm = embed_layernorm + self.num_key_value_heads = kwargs.pop("num_key_value_heads", None) + self.ffn_hidden_size = ffn_hidden_size + self.hidden_act = hidden_act + super().__init__(bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs) From fcc717246ceb3fb1cb135f698f562da5347527ae Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 27 Nov 2024 19:55:38 +0800 Subject: [PATCH 074/293] [Bugfix][Hardware][CPU] Fix intel-omp version to avoid segfault (#10700) Signed-off-by: jiang1.li Signed-off-by: Andrew Feldman --- Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.cpu b/Dockerfile.cpu index d2f72ea975a3d..ebe226cf6d148 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -16,7 +16,7 @@ RUN --mount=type=cache,target=/var/cache/apt \ # intel-openmp provides additional performance improvement vs. openmp # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. 
RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-openmp + pip install intel-openmp==2025.0.1 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" From 9cc018ae38553ede15bd3646679916d05147d599 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 27 Nov 2024 04:26:27 -0800 Subject: [PATCH 075/293] [V1] Update interface for mistral-format Pixtral (#10703) Signed-off-by: Roger Wang Signed-off-by: Andrew Feldman --- vllm/model_executor/models/pixtral.py | 47 ++++++++++++++++----------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6711cbf5694b9..45171c1a04b17 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,7 +31,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import PlaceholderRange +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, resolve_visual_encoder_outputs) @@ -190,6 +190,25 @@ def sampler(self): return get_sampler() + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.vision_args.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -197,31 +216,21 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for pixtral. - - TODO - """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.vision_args.image_token_id) - - input_ids = None - else: - inputs_embeds = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, From d65fc83a816ab1ac3f3883b3d70e68887df58cce Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 27 Nov 2024 09:26:14 -0800 Subject: [PATCH 076/293] [ci] fix slow tests (#10698) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- tests/entrypoints/llm/test_lazy_outlines.py | 22 ++++++++++++++----- tests/test_lazy_torch_compile.py | 22 ++++++++++++++----- .../vllm_test_utils/vllm_test_utils/blame.py | 10 ++++----- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 81fb000d8ac56..2c53676c5f5dd 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,6 +1,7 @@ import sys +from contextlib import nullcontext -from vllm_test_utils import blame +from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -56,9 +57,20 @@ def test_lazy_outlines(sample_regex): """ # make sure outlines is not imported module_name = "outlines" - with blame(lambda: module_name in sys.modules) as result: + # In CI, we only check finally if the module is imported. + # If it is indeed imported, we can rerun the test with `use_blame=True`, + # which will trace every function call to find the first import location, + # and help find the root cause. + # We don't run it in CI by default because it is slow. + use_blame = False + context = blame( + lambda: module_name in sys.modules) if use_blame else nullcontext() + with context as result: run_normal() run_lmfe(sample_regex) - assert not result.found, ( - f"Module {module_name} is already imported, the" - f" first import location is:\n{result.trace_stack}") + if use_blame: + assert isinstance(result, BlameResult) + print(f"the first import location is:\n{result.trace_stack}") + assert module_name not in sys.modules, ( + f"Module {module_name} is imported. To see the first" + f" import location, run the test with `use_blame=True`.") diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py index 4756fac8e2a8d..b950877a4337b 100644 --- a/tests/test_lazy_torch_compile.py +++ b/tests/test_lazy_torch_compile.py @@ -2,15 +2,27 @@ # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script import sys +from contextlib import nullcontext -from vllm_test_utils import blame +from vllm_test_utils import BlameResult, blame module_name = "torch._inductor.async_compile" -with blame(lambda: module_name in sys.modules) as result: +# In CI, we only check finally if the module is imported. +# If it is indeed imported, we can rerun the test with `use_blame=True`, +# which will trace every function call to find the first import location, +# and help find the root cause. +# We don't run it in CI by default because it is slow. 
+use_blame = False +context = blame( + lambda: module_name in sys.modules) if use_blame else nullcontext() +with context as result: import vllm # noqa -assert not result.found, (f"Module {module_name} is already imported, the" - f" first import location is:\n{result.trace_stack}") +if use_blame: + assert isinstance(result, BlameResult) + print(f"the first import location is:\n{result.trace_stack}") -print(f"Module {module_name} is not imported yet") +assert module_name not in sys.modules, ( + f"Module {module_name} is imported. To see the first" + f" import location, run the test with `use_blame=True`.") diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index ad23ab83c2d81..1ddd3471d357b 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -46,8 +46,8 @@ def _trace_calls(frame, event, arg=None): pass return _trace_calls - sys.settrace(_trace_calls) - - yield result - - sys.settrace(None) + try: + sys.settrace(_trace_calls) + yield result + finally: + sys.settrace(None) From 046dfc437631050431477abb8db3cce7f796657e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 27 Nov 2024 10:16:10 -0800 Subject: [PATCH 077/293] [torch.compile] fix shape specialization (#10722) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- vllm/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 68f73bf4b4dc9..cd24e9ffdf598 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2151,7 +2151,7 @@ class CompilationConfig(BaseModel): use_inductor: bool = True inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None - inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) + inductor_compile_sizes: Optional[List[int]] = Field(default=None) inductor_compile_config: Dict = Field(default_factory=dict) inductor_passes: Dict[str, str] = Field(default_factory=dict) @@ -2290,9 +2290,8 @@ def init_during_runtime(self): if x <= self.inductor_specialize_for_cudagraph_no_more_than ] else: - assert self.inductor_compile_sizes is not None, ( - "inductor_compile_sizes should not be None when " - "inductor_specialize_for_cudagraph_no_more_than is None") + if self.inductor_compile_sizes is None: + self.inductor_compile_sizes = [] self.compile_sizes = self.inductor_compile_sizes From 9bf5c8dae7a64dfc814af7cd5f3ccf9c7ba3f90e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 28 Nov 2024 02:43:17 +0800 Subject: [PATCH 078/293] [Bugfix] Fix GGUF inference with FP16 unquantized checkpoint (#10675) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- .../layers/quantization/gguf.py | 69 ++++++++++++++++--- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 24138662eb25c..f0943efa0039d 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -2,6 +2,7 @@ import gguf import torch +from gguf import GGMLQuantizationType as WeightType from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops @@ -49,19 +50,65 @@ def get_quant_method(self, layer: torch.nn.Module, return None +UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} +STANDARD_QUANT_TYPES = { + WeightType.Q4_0, + WeightType.Q4_1, + WeightType.Q5_0, + WeightType.Q5_1, + WeightType.Q8_0, + WeightType.Q8_1, 
+}
+KQUANT_TYPES = {
+    WeightType.Q2_K,
+    WeightType.Q3_K,
+    WeightType.Q4_K,
+    WeightType.Q5_K,
+    WeightType.Q6_K,
+}
+IMATRIX_QUANT_TYPES = {
+    WeightType.IQ1_M,
+    WeightType.IQ1_S,
+    WeightType.IQ2_XXS,
+    WeightType.IQ2_XS,
+    WeightType.IQ2_S,
+    WeightType.IQ3_XXS,
+    WeightType.IQ3_S,
+    WeightType.IQ4_XS,
+    WeightType.IQ4_NL,
+}
+# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization.
+# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add
+# MMQ kernel for I-Matrix quantization.
+DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
+MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
+MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
+
+
 def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
                   qweight_type: int) -> torch.Tensor:
-    # use dequantize mulmat for IQmatrix, mmq for k-quants
-    if x.shape[0] == 1:
-        # enable mmvq in contiguous batching
+    # there is no need to call any kernel for fp16/bf16
+    if qweight_type in UNQUANTIZED_TYPES:
+        return x @ qweight.T
+    # enable MMVQ in contiguous batching with batch_size=1
+    if x.shape[0] == 1 and qweight_type in MMVQ_QUANT_TYPES:
         y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type,
                                     qweight.shape[0])
-    elif qweight_type >= 16:
+    # Use MMQ Kernel if it's available (standard + k-quants)
+    elif qweight_type in MMQ_QUANT_TYPES:
+        y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
+    # If there is no available MMQ kernel, fallback to dequantize
+    elif qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
         weight = ops.ggml_dequantize(qweight, qweight_type, *shape)
         y = x @ weight.T
     else:
-        y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
+        # Raise an error if the quantization type is not supported.
+        # Might be useful if llama.cpp adds a new quantization type.
+        # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
+ qweight_type = WeightType(qweight_type) + raise NotImplementedError( + f"Unsupported GGUF quantization type: {qweight_type}") return y @@ -121,9 +168,9 @@ def apply(self, shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id qweight = layer.qweight.unbind(0) result = [] - for id in shard_id: - q_idx = layer.qweight.shard_id_map[id] - qweight_type = layer.qweight_type.shard_weight_type[id] + for idx in shard_id: + q_idx = layer.qweight.shard_id_map[idx] + qweight_type = layer.qweight_type.shard_weight_type[idx] result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type)) out = torch.cat(result, axis=1) else: @@ -163,9 +210,13 @@ class GGUFUninitializedParameter(UninitializedParameter): data_container: List[torch.Tensor] def materialize_nested(self) -> Parameter: + dtype = {data.dtype for data in self.data_container} + assert len(dtype) == 1, ValueError( + f"Data container has mixed dtypes: {dtype}") + dtype = next(iter(dtype)) nested_data = torch.nested.nested_tensor(self.data_container, device=self.device, - dtype=torch.uint8) + dtype=dtype) self.data_container.clear() param = torch.Tensor._make_subclass(self.cls_to_become, nested_data, From 4e53851d4a579f3d16bc73a5808fe6ea38fcf356 Mon Sep 17 00:00:00 2001 From: Mor Zusman Date: Wed, 27 Nov 2024 21:02:27 +0200 Subject: [PATCH 079/293] [Bugfix][Mamba] Fix Multistep on Mamba-like models (#10705) Signed-off-by: mzusman Signed-off-by: Andrew Feldman --- .../decoder_only/language/test_jamba.py | 38 +++++++++++++++++++ .../decoder_only/language/test_mamba.py | 36 ++++++++++++++++++ vllm/engine/async_llm_engine.py | 7 +++- vllm/engine/llm_engine.py | 7 +++- 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 6542689c3f277..87a05b3011393 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -275,6 +275,44 @@ def test_state_cleanup( "could be related to finished_requests_ids") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_multistep( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is verifying that multistep works correctly + #on mamba-like models + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 10, 1) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness(vllm_runner, model: str, dtype: str, + max_tokens: int, example_prompts) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) + + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 78eab8d5354fd..01e208347bff4 100644 --- a/tests/models/decoder_only/language/test_mamba.py 
+++ b/tests/models/decoder_only/language/test_mamba.py @@ -283,3 +283,39 @@ def test_state_cleanup( except ValueError: pytest.fail("Mamba inner state wasn't cleaned up between states, " "could be related to finished_requests_ids") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_multistep( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 10, 1) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness(vllm_runner, model: str, dtype: str, + max_tokens: int, example_prompts) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3224577c567f8..31a15b04314d5 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -300,6 +300,9 @@ async def step_async( ctx.seq_group_metadata_list = seq_group_metadata_list ctx.scheduler_outputs = scheduler_outputs + finished_requests_ids = self.scheduler[ + virtual_engine].get_and_reset_finished_requests_ids() + # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) @@ -311,13 +314,13 @@ async def step_async( self._cache_scheduler_outputs_for_multi_step( virtual_engine, seq_group_metadata_list, scheduler_outputs, allow_async_output_proc) + else: + finished_requests_ids = list() assert seq_group_metadata_list is not None assert scheduler_outputs is not None if not scheduler_outputs.is_empty(): - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() # Check if we have a cached last_output from the previous iteration. 
# For supporting PP this is probably the best way to pass the diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a4975cece9a81..ecc222f692c41 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1398,6 +1398,9 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: ctx.seq_group_metadata_list = seq_group_metadata_list ctx.scheduler_outputs = scheduler_outputs + finished_requests_ids = self.scheduler[ + virtual_engine].get_and_reset_finished_requests_ids() + # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) @@ -1409,13 +1412,13 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: self._cache_scheduler_outputs_for_multi_step( virtual_engine, seq_group_metadata_list, scheduler_outputs, allow_async_output_proc) + else: + finished_requests_ids = list() assert seq_group_metadata_list is not None assert scheduler_outputs is not None if not scheduler_outputs.is_empty(): - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() # Check if we have a cached last_output from the previous iteration. # For supporting PP this is probably the best way to pass the From 8239c6f09695aba80102fd7dbd87c567d2bf2889 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 28 Nov 2024 03:05:29 +0800 Subject: [PATCH 080/293] [Bugfix] Ignore `lm_head` when loading embedding models (#10719) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- vllm/model_executor/models/bert.py | 2 ++ vllm/model_executor/models/gemma2.py | 2 ++ vllm/model_executor/models/llama.py | 2 ++ vllm/model_executor/models/qwen2.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 1fff72b3490e9..053d838432885 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -443,6 +443,8 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) def _build_model(self, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c93223c740272..d35fcb012e166 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -504,4 +504,6 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fffb3fe53b94c..fe94bb352961b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -689,6 +689,8 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) def load_kv_cache_scales(self, quantization_param_path: str) -> None: diff --git a/vllm/model_executor/models/qwen2.py 
b/vllm/model_executor/models/qwen2.py index 9f706610a129a..87943e53d861c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -580,4 +580,6 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) From 5a3a0eb23607e1d96c8cd2a74d730045477e28e9 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:21:10 +0200 Subject: [PATCH 081/293] [Frontend] don't block event loop in tokenization (preprocess) in OpenAI compatible server (#10635) Signed-off-by: Tomer Asida Signed-off-by: Andrew Feldman --- .../openai/test_async_tokenization.py | 137 ++++++++++++++++++ vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_embedding.py | 15 +- vllm/entrypoints/openai/serving_engine.py | 75 +++++----- vllm/entrypoints/openai/serving_score.py | 10 +- .../openai/serving_tokenization.py | 15 +- vllm/utils.py | 8 +- 7 files changed, 206 insertions(+), 56 deletions(-) create mode 100644 tests/entrypoints/openai/test_async_tokenization.py diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py new file mode 100644 index 0000000000000..fcce8b46c4344 --- /dev/null +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -0,0 +1,137 @@ +import asyncio +import contextlib +import random +import time +from typing import Callable + +import openai +import pytest +import pytest_asyncio +import requests + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + + +@pytest.fixture(scope="module") +def server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + "--load-format", + "dummy", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ids=["completion", "chat"], + argnames=["create_func_gen", "content_body"], + argvalues=[ + (lambda x: x.completions.create, { + "prompt": " ".join(['A'] * 10_000) + }), + (lambda x: x.chat.completions.create, { + "messages": [{ + "role": "user", + "content": " ".join(['A'] * 10_000) + }] + }), + ], +) +async def test_with_and_without_truncate( + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + create_func_gen: Callable, + content_body: dict, +): + create_func = create_func_gen(client) + body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} + + num_requests = 10 + truncate_prompt_tokens = ([1000] * (num_requests // 2) + [None] * + (num_requests - num_requests // 2)) + random.shuffle(truncate_prompt_tokens) + + bodies = [{ + **body, "extra_body": { + 'truncate_prompt_tokens': t + } + } for t in truncate_prompt_tokens] + + async def get_status_code(**kwargs): + try: + await create_func(**kwargs) + return 200 + except openai.APIStatusError as e: + return e.status_code + + responses = await asyncio.gather(*[get_status_code(**b) for b in bodies]) + assert 500 not in responses + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize( + ids=["single completion", "multiple completions", "chat"], + argnames=["create_func_gen", "content_body"], + argvalues=[ + (lambda x: x.completions.create, { + "prompt": " ".join(['A'] * 300_000) + }), + (lambda x: x.completions.create, { + "prompt": [" ".join(['A'] * 300_000)] * 2 + }), + (lambda x: x.chat.completions.create, { + "messages": [{ + "role": "user", + "content": " ".join(['A'] * 300_000) + }] + }), + ], +) +async def test_healthcheck_response_time( + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + create_func_gen: Callable, + content_body: dict, +): + num_requests = 50 + + create_func = create_func_gen(client) + body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} + + def get_response_time(url): + start_time = time.monotonic() + res = requests.get(url) + end_time = time.monotonic() + assert res.status_code == 200 + return end_time - start_time + + no_load_response_time = get_response_time(server.url_for("health")) + tasks = [ + asyncio.create_task(create_func(**body)) for _ in range(num_requests) + ] + await asyncio.sleep(1) # give the tasks a chance to start running + load_response_time = get_response_time(server.url_for("health")) + + with contextlib.suppress(openai.APIStatusError): + await asyncio.gather(*tasks) + + assert load_response_time < 100 * no_load_response_time + assert load_response_time < 0.1 diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 936aae8f1c267..fc1c4908d6650 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -101,7 +101,7 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) - request_prompts, engine_prompts = self._preprocess_completion( + request_prompts, engine_prompts = await self._preprocess_completion( request, tokenizer, request.prompt, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index c84a7d2d8e13e..78e2416d9d4da 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -156,13 +156,14 @@ async def create_embedding( add_special_tokens=request.add_special_tokens, ) else: - request_prompts, engine_prompts = self._preprocess_completion( - request, - tokenizer, - request.input, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) + (request_prompts, + engine_prompts) = await self._preprocess_completion( + request, + tokenizer, + request.input, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index cae2877ea7e99..8232c6116c1bd 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,5 +1,6 @@ import json import pathlib +from concurrent.futures.thread import ThreadPoolExecutor from dataclasses import dataclass from http import HTTPStatus from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, @@ -46,7 +47,7 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import AtomicCounter, 
is_list_of +from vllm.utils import AtomicCounter, is_list_of, make_async logger = init_logger(__name__) @@ -140,6 +141,14 @@ def __init__( self.request_logger = request_logger self.return_tokens_as_token_ids = return_tokens_as_token_ids + self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) + + self._tokenize_prompt_input_async = make_async( + self._tokenize_prompt_input, executor=self._tokenizer_executor) + self._tokenize_prompt_input_or_inputs_async = make_async( + self._tokenize_prompt_input_or_inputs, + executor=self._tokenizer_executor) + async def show_available_models(self) -> ModelList: """Show available models. Right now we only have one model.""" model_cards = [ @@ -368,7 +377,7 @@ def _tokenize_prompt_input_or_inputs( input_or_inputs: Union[str, List[str], List[int], List[List[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Iterator[TextTokensPrompt]: + ) -> List[TextTokensPrompt]: """ Tokenize/detokenize depending on the input format. @@ -376,45 +385,41 @@ def _tokenize_prompt_input_or_inputs( , each input can be a string or array of tokens. Note that each request can pass one or more inputs. """ - for prompt_input in parse_and_batch_prompt(input_or_inputs): - # Although our type checking is based on mypy, - # VSCode Pyright extension should still work properly - # "is True" is required for Pyright to perform type narrowing - # See: https://github.com/microsoft/pyright/issues/7672 - if prompt_input["is_tokens"] is False: - yield self._normalize_prompt_text_to_input( - request, - tokenizer, - prompt=prompt_input["content"], - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=add_special_tokens, - ) - else: - yield self._normalize_prompt_tokens_to_input( - request, - tokenizer, - prompt_ids=prompt_input["content"], - truncate_prompt_tokens=truncate_prompt_tokens, - ) + # Although our type checking is based on mypy, + # VSCode Pyright extension should still work properly + # "is True" is required for Pyright to perform type narrowing + # See: https://github.com/microsoft/pyright/issues/7672 + return [ + self._normalize_prompt_text_to_input( + request, + tokenizer, + prompt=prompt_input["content"], + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens) + if prompt_input["is_tokens"] is False else + self._normalize_prompt_tokens_to_input( + request, + tokenizer, + prompt_ids=prompt_input["content"], + truncate_prompt_tokens=truncate_prompt_tokens) + for prompt_input in parse_and_batch_prompt(input_or_inputs) + ] - def _preprocess_completion( + async def _preprocess_completion( self, request: CompletionLikeRequest, tokenizer: AnyTokenizer, input_or_inputs: Union[str, List[str], List[int], List[List[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]: - request_prompts = [ - request_prompt - for request_prompt in self._tokenize_prompt_input_or_inputs( - request, - tokenizer, - input_or_inputs, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=add_special_tokens, - ) - ] + ) -> Tuple[List[TextTokensPrompt], List[TokensPrompt]]: + request_prompts = await self._tokenize_prompt_input_or_inputs_async( + request, + tokenizer, + input_or_inputs, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) engine_prompts = [ TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"]) @@ 
-493,7 +498,7 @@ async def _preprocess_chat( request=request) if isinstance(request_prompt, str): - prompt_inputs = self._tokenize_prompt_input( + prompt_inputs = await self._tokenize_prompt_input_async( request, tokenizer, request_prompt, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 156fea6f47982..7cd8ff08b5608 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.outputs import EmbeddingRequestOutput from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer -from vllm.utils import merge_async_iterators, random_uuid +from vllm.utils import make_async, merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -145,9 +145,11 @@ async def create_score( tokenization_kwargs["truncation"] = True tokenization_kwargs["max_length"] = truncate_prompt_tokens - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=q, + text_pair=t, + **tokenization_kwargs) engine_prompt = TokensPrompt( prompt_token_ids=prompt_inputs["input_ids"], token_type_ids=prompt_inputs.get("token_type_ids")) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 59b3b1311f881..9c3dc2c98b2dd 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -81,12 +81,13 @@ async def create_tokenize( add_special_tokens=request.add_special_tokens, ) else: - request_prompts, engine_prompts = self._preprocess_completion( - request, - tokenizer, - request.prompt, - add_special_tokens=request.add_special_tokens, - ) + (request_prompts, + engine_prompts) = await self._preprocess_completion( + request, + tokenizer, + request.prompt, + add_special_tokens=request.add_special_tokens, + ) except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -134,7 +135,7 @@ async def create_detokenize( # Silently ignore prompt adapter since it does not affect tokenization # (Unlike in Embeddings API where an error is raised) - prompt_input = self._tokenize_prompt_input( + prompt_input = await self._tokenize_prompt_input_async( request, tokenizer, request.tokens, diff --git a/vllm/utils.py b/vllm/utils.py index bec876d983701..6f7a6f8c54e47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,5 +1,6 @@ import argparse import asyncio +import concurrent import contextlib import datetime import enum @@ -351,7 +352,10 @@ def in_wsl() -> bool: return "microsoft" in " ".join(uname()).lower() -def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: +def make_async( + func: Callable[P, T], + executor: Optional[concurrent.futures.Executor] = None +) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. 
This function prevents the blocking function from blocking the @@ -362,7 +366,7 @@ def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: loop = asyncio.get_event_loop() p_func = partial(func, *args, **kwargs) - return loop.run_in_executor(executor=None, func=p_func) + return loop.run_in_executor(executor=executor, func=p_func) return _async_wrapper From b22e27c42dac7293c21ca290234f46f09d6005e8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 27 Nov 2024 19:54:58 -0800 Subject: [PATCH 082/293] [misc] upgrade filelock version (#10731) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index f62ad66a1ecc4..02e3d65fb774c 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -20,7 +20,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 typing_extensions >= 4.10 -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs pyzmq msgspec From b5864e2fb8f11d689289fc77ec3faad5d2833a42 Mon Sep 17 00:00:00 2001 From: zixuanzhang226 Date: Wed, 27 Nov 2024 23:58:02 -0800 Subject: [PATCH 083/293] [Model] support bitsandbytes quantization with minicpm3 model (#10682) Signed-off-by: Ubuntu Signed-off-by: Andrew Feldman --- vllm/model_executor/models/minicpm3.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c38c31a0d4953..c66be2d9c2d07 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -241,6 +241,12 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): # `embedding_modules` and `embedding_padding_modules` # are inherited from MiniCPMForCausalLM + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = MiniCPM3Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) From b9cabc97a133355e1e52f3b19b77acfede1677e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E6=B3=BD=E8=BD=A9?= Date: Thu, 28 Nov 2024 15:58:39 +0800 Subject: [PATCH 084/293] [Doc] Update model in arch_overview.rst to match comment (#10701) Signed-off-by: spacewander Signed-off-by: Andrew Feldman --- docs/source/design/arch_overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.rst index a9e7b4bd69bc7..bc3f509f0a66e 100644 --- a/docs/source/design/arch_overview.rst +++ b/docs/source/design/arch_overview.rst @@ -42,7 +42,7 @@ Here is a sample of `LLM` class usage: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Initialize the LLM engine with the OPT-125M model - llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct") + llm = LLM(model="facebook/opt-125m") # Generate outputs for the input prompts outputs = llm.generate(prompts, sampling_params) From d61d6615f09adb0a1905d4f82ca2b53f0a6cc99a Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Wed, 27 Nov 2024 23:59:28 -0800 Subject: [PATCH 085/293] [Bug][CLI] Allow users to disable prefix caching explicitly 
(#10724) Signed-off-by: rickyx Signed-off-by: Andrew Feldman --- tests/engine/test_arg_utils.py | 19 +++++++++++++++++++ tests/v1/engine/test_engine_args.py | 19 +++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 5b0e76fe53685..de78d41ad12eb 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -59,6 +59,25 @@ def test_compilation_config(): assert args.compilation_config.level == 3 +def test_prefix_cache_default(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args([]) + + engine_args = EngineArgs.from_cli_args(args=args) + assert (not engine_args.enable_prefix_caching + ), "prefix caching defaults to off." + + # with flag to turn it on. + args = parser.parse_args(["--enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.enable_prefix_caching + + # with disable flag to turn it off. + args = parser.parse_args(["--no-enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert not engine_args.enable_prefix_caching + + def test_valid_pooling_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([ diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 69cfdf5a395c1..ac5e7dde525a7 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -4,6 +4,7 @@ from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.usage.usage_lib import UsageContext +from vllm.utils import FlexibleArgumentParser if not envs.VLLM_USE_V1: pytest.skip( @@ -12,6 +13,24 @@ ) +def test_prefix_caching_from_cli(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args([]) + engine_args = EngineArgs.from_cli_args(args=args) + assert (engine_args.enable_prefix_caching + ), "V1 turns on prefix caching by default." + + # Turn it off possible with flag. + args = parser.parse_args(["--no-enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert not engine_args.enable_prefix_caching + + # Turn it on with flag. + args = parser.parse_args(["--enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.enable_prefix_caching + + def test_defaults(): engine_args = EngineArgs(model="facebook/opt-125m") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 90b4798f17a13..f0020562c3c3a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -416,9 +416,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'tokens. This is ignored on neuron devices and ' 'set to max-model-len') - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='Enables automatic prefix caching.') + parser.add_argument( + "--enable-prefix-caching", + action=argparse.BooleanOptionalAction, + default=EngineArgs.enable_prefix_caching, + help="Enables automatic prefix caching. 
" + "Use --no-enable-prefix-caching to disable explicitly.", + ) parser.add_argument('--disable-sliding-window', action='store_true', help='Disables sliding window, ' From 39f449473d2c528ef50fd5a89b0b83e800bce2e0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 00:13:15 -0800 Subject: [PATCH 086/293] [V1] Do not allocate beyond the max_model_len (#10730) Signed-off-by: Woosuk Kwon Signed-off-by: Andrew Feldman --- tests/v1/core/test_prefix_caching.py | 24 ++++++++++++++++-------- vllm/v1/core/kv_cache_manager.py | 17 +++++++++++++++++ vllm/v1/core/scheduler.py | 15 ++++++++------- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 83bfbb6ade8d7..b44d3e5cb0678 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -23,7 +23,8 @@ def test_prefill(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -121,7 +122,8 @@ def test_decode(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -172,7 +174,8 @@ def test_evict(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -220,7 +223,8 @@ def test_hash_block_correct_reuse(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=1, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) @@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=2, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) @@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=4, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=False, num_preallocate_tokens=0, ) @@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=num_preallocate_tokens, ) @@ -370,7 +377,8 @@ def test_cache_blocks(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=5, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 8eb3fb976eb87..b492a755e6dd5 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -17,12 +17,15 @@ def __init__( self, block_size: int, num_gpu_blocks: int, + max_model_len: int, sliding_window: Optional[int] = None, enable_caching: bool = True, num_preallocate_tokens: int = 64, ) -> None: self.block_size = block_size self.num_gpu_blocks = num_gpu_blocks + self.max_model_len = max_model_len + self.max_num_blocks_per_req = cdiv(max_model_len, block_size) self.sliding_window = sliding_window self.enable_caching = enable_caching # NOTE(woosuk): To avoid frequent block allocation, we preallocate some @@ -132,7 +135,14 @@ def 
append_slots( num_new_blocks = min( num_new_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks, + # Should not exceed the maximum number of blocks per request. + # This is especially because the block table has the shape + # [..., max_num_blocks_per_req]. + # TODO(woosuk): Check and reject requests if + # num_prompt_tokens + max_tokens > max_model_len. + self.max_num_blocks_per_req - len(req_blocks), ) + assert num_new_blocks > 0 new_blocks = self._get_new_blocks(num_new_blocks) req_blocks.extend(new_blocks) @@ -212,7 +222,14 @@ def allocate_slots( num_required_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks - num_evictable_computed_blocks, + # Should not exceed the maximum number of blocks per request. + # This is especially because the block table has the shape + # [..., max_num_blocks_per_req]. + # TODO(woosuk): Check and reject requests if + # num_prompt_tokens + max_tokens > max_model_len. + self.max_num_blocks_per_req - len(computed_blocks), ) + assert num_new_blocks > 0 # Concatenate the computed block IDs and the new block IDs. new_blocks = self._get_new_blocks(num_new_blocks) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 5ada9ceab54e6..b515d15172c44 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -34,22 +34,23 @@ def __init__( # TODO: Support LoRA. assert lora_config is None, "V1 does not support LoRA yet." + # Scheduling constraints. + self.max_num_running_reqs = self.scheduler_config.max_num_seqs + self.max_num_scheduled_tokens = \ + self.scheduler_config.max_num_batched_tokens + self.max_model_len = self.scheduler_config.max_model_len + num_gpu_blocks = cache_config.num_gpu_blocks assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 - # Create the block space manager. + # Create the KV cache manager. self.kv_cache_manager = KVCacheManager( block_size=self.cache_config.block_size, num_gpu_blocks=num_gpu_blocks, + max_model_len=self.max_model_len, sliding_window=self.cache_config.sliding_window, enable_caching=self.cache_config.enable_prefix_caching) self.block_size = self.cache_config.block_size - # Scheduling constraints. - self.max_num_running_reqs = self.scheduler_config.max_num_seqs - self.max_num_scheduled_tokens = \ - self.scheduler_config.max_num_batched_tokens - self.max_model_len = self.scheduler_config.max_model_len - # req_id -> Request self.requests: Dict[str, Request] = {} # Priority queues for requests. 
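The clamp introduced by the KV-cache manager patch above ("[V1] Do not allocate beyond the max_model_len") can be summarized with the standalone sketch below. This is a simplified illustration, not the vLLM code itself: names and numbers are toy values, `cdiv` mirrors the ceiling-division helper the diff uses, and the real `allocate_slots` path additionally subtracts evictable computed blocks from the free-block count.

    def cdiv(a: int, b: int) -> int:
        # Ceiling division, as used for max_num_blocks_per_req in the diff.
        return -(-a // b)

    def clamp_new_blocks(num_required: int, num_preallocate: int,
                         num_free: int, max_model_len: int, block_size: int,
                         num_existing_blocks: int) -> int:
        # Never hand out more blocks than a max_model_len-sized request can
        # ever use, so preallocation cannot overrun the block table of shape
        # [..., max_num_blocks_per_req].
        max_blocks_per_req = cdiv(max_model_len, block_size)
        return min(num_required + num_preallocate,
                   num_free,
                   max_blocks_per_req - num_existing_blocks)

    # With max_model_len=8192 and block_size=16 a request can use at most 512
    # blocks, so once 510 blocks are assigned only 2 more are granted, even
    # though 4 preallocated blocks were requested.
    assert clamp_new_blocks(1, 4, 100, 8192, 16, 510) == 2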
From dcdf2f37ff4a54209578155f2f230d491e342a18 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 02:25:59 -0800 Subject: [PATCH 087/293] [Kernel] Update vllm-flash-attn version (#10736) Signed-off-by: Woosuk Kwon Signed-off-by: Andrew Feldman --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 882d4412632a5..45a3b484e0360 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 + GIT_TAG d886f88165702b3c7e7744502772cd98b06be9e1 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From ea6ed6b24f8ae21dfed25391c6f7ec7eec231066 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Thu, 28 Nov 2024 02:30:48 -0800 Subject: [PATCH 088/293] [TPU] Update requirements-tpu (#10726) Signed-off-by: Richard Liu Signed-off-by: Andrew Feldman --- requirements-tpu.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 3d1e80f6be620..b8f0b15469e77 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -16,8 +16,8 @@ ray[default] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241114+cpu -torchvision==0.20.0.dev20241114+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241114-cp310-cp310-linux_x86_64.whl -jaxlib==0.4.32.dev20240829 -jax==0.4.32.dev20240829 +torch==2.6.0.dev20241126+cpu +torchvision==0.20.0.dev20241126+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.36.dev20241122 +jax==0.4.36.dev20241122 From ac0b495b3f39070331136bfe0da2ee9d353a91f9 Mon Sep 17 00:00:00 2001 From: sixgod Date: Thu, 28 Nov 2024 22:53:31 +0800 Subject: [PATCH 089/293] [Model] Added GLM-4 series hf format model support vllm==0.6.4 (#10561) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 5 +++++ tests/models/registry.py | 1 + tests/models/test_initialization.py | 2 +- vllm/model_executor/models/glm.py | 21 +++++++++++++++++++++ vllm/model_executor/models/registry.py | 2 ++ 5 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/glm.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index c5fbb30b24e28..fd0671beacee7 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -139,6 +139,11 @@ Text Generation - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. - ✅︎ - ✅︎ + * - :code:`GlmForCausalLM` + - GLM-4 + - :code:`THUDM/glm-4-9b-chat-hf`, etc. + - ✅︎ + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. 
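The documentation rows added above advertise the new `GlmForCausalLM` architecture; a minimal smoke-test sketch for the newly supported model is shown below. The checkpoint name comes from the table entry, while the dtype, prompt, and sampling settings are illustrative assumptions rather than part of this patch.

    from vllm import LLM, SamplingParams

    # THUDM/glm-4-9b-chat-hf is the HF-format checkpoint listed in the
    # supported-models table above; the remaining settings are arbitrary
    # example values.
    llm = LLM(model="THUDM/glm-4-9b-chat-hf",
              dtype="bfloat16",
              max_model_len=4096)
    params = SamplingParams(temperature=0.0, max_tokens=32)
    outputs = llm.generate(["What is GLM-4?"], params)
    print(outputs[0].outputs[0].text)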
diff --git a/tests/models/registry.py b/tests/models/registry.py index a93bfe907e0d7..461f453d8b1c3 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -63,6 +63,7 @@ class _HfExamplesInfo: "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), + "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), "GPT2LMHeadModel": _HfExamplesInfo("gpt2"), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"), "GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b8312c2d9b7cc..2a072737db043 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): - if (model_arch == "Idefics3ForConditionalGeneration" + if (model_arch in {"Idefics3ForConditionalGeneration", "GlmForCausalLM"} and transformers.__version__ < "4.46.0"): pytest.skip(reason="Model introduced in HF >= 4.46.0") diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py new file mode 100644 index 0000000000000..942d1e14baed1 --- /dev/null +++ b/vllm/model_executor/models/glm.py @@ -0,0 +1,21 @@ +"""Inference-only HF format GLM-4 model compatible with THUDM weights.""" +from vllm.config import VllmConfig +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import PPMissingLayer + + +class GlmForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + # Hack Llama model to fit HF format GLM implementation + # Attention difference between GLM and Llama: + # 1. Half partial rotary_dim and no Neox style. + # 2. 
There is no bias for o_proj in attention + for layer in self.model.layers: + if not isinstance(layer, PPMissingLayer): + layer.self_attn.rotary_emb.rotary_dim //= 2 + layer.self_attn.rotary_emb.is_neox_style = False + layer.self_attn.o_proj.bias = None + layer.self_attn.o_proj.skip_bias_add = True diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4462f6ed55a9c..c400c7d59828c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -48,6 +48,7 @@ "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), + "GlmForCausalLM": ("glm", "GlmForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), @@ -107,6 +108,7 @@ "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), + "GlmForCausalLM": ("glm", "GlmForCausalLM"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), **{ # Multiple models share the same architecture, so we include them all From 1362dacabc4b6a51601e60b27726f6f87d182827 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 08:31:28 -0800 Subject: [PATCH 090/293] [Kernel] Update vllm-flash-attn version to reduce CPU overheads (#10742) Signed-off-by: Woosuk Kwon Signed-off-by: Andrew Feldman --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45a3b484e0360..f43bf8143458b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG d886f88165702b3c7e7744502772cd98b06be9e1 + GIT_TAG fdf6d72b48aea41f4ae6a89139a453dae554abc8 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From bc6637cf14ed426f8d7a4d0361360a6bd0fe8b92 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 09:01:02 -0800 Subject: [PATCH 091/293] [V1] Optimize the CPU overheads in FlashAttention custom op (#10733) Signed-off-by: Woosuk Kwon Signed-off-by: Andrew Feldman --- vllm/v1/attention/backends/flash_attn.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 5f8535eaa303f..e618edf7d35bf 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -135,6 +135,13 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + # Reshape the query, key, and value tensors. + # NOTE(woosuk): We do this outside the custom op to minimize the CPU + # overheads from the non-CUDA-graph regions. 
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + output = torch.empty_like(query) torch.ops.vllm.unified_v1_flash_attention( output, @@ -153,7 +160,7 @@ def forward( self.alibi_slopes, self.logits_soft_cap, ) - return output + return output.view(-1, self.num_heads * self.head_size) def unified_v1_flash_attention( @@ -184,11 +191,6 @@ def unified_v1_flash_attention( attn_metadata: FlashAttentionMetadata = current_metadata num_actual_tokens = attn_metadata.num_actual_tokens - # Reshape the query, key, and value tensors. - query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - # Reshape the input keys and values and store them in the cache. key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -218,8 +220,7 @@ def unified_v1_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - attn_output = attn_output.view(num_actual_tokens, -1) - # TODO(woosuk): Optimize this. + # TODO(woosuk): Remove this unnecessary copy. output[:num_actual_tokens].copy_(attn_output) From 3733796f63577211e5a0c615a4f0b47d53ffc9bc Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 29 Nov 2024 01:29:04 +0800 Subject: [PATCH 092/293] [Model] Add Internlm2 LoRA support (#5064) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/internlm2.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index fd0671beacee7..7b7a83f20871b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -182,7 +182,7 @@ Text Generation * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. 
- - + - ✅︎ - ✅︎ * - :code:`JAISLMHeadModel` - Jais diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 906128940ff76..41b9f110d771f 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -27,7 +27,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -319,7 +319,21 @@ def forward( return hidden_states -class InternLM2ForCausalLM(nn.Module, SupportsPP): +class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + packed_modules_mapping = { + "wqkv": ["wqkv"], + "gate_up_proj": ["w1", "w3"], + } + + # LoRA specific attributes + supported_lora_modules = [ + "wqkv", + "wo", + "gate_up_proj", + "w2", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, *, @@ -329,8 +343,12 @@ def __init__(self, super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.quant_config = quant_config + self.lora_config = lora_config + self.model = model_type(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, From 170a30c1f77707e1702ea4152dff7c467047e64b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 29 Nov 2024 12:47:06 +0800 Subject: [PATCH 093/293] [Model] Clean up MiniCPMV (#10751) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- .../vision_language/test_models.py | 19 ++- .../vision_language/vlm_utils/model_utils.py | 13 +- vllm/model_executor/layers/fused_moe/layer.py | 10 +- vllm/model_executor/models/minicpm.py | 153 +++++++++--------- vllm/model_executor/models/minicpm3.py | 5 +- vllm/model_executor/models/minicpmv.py | 136 ++++------------ vllm/model_executor/models/utils.py | 28 +--- 7 files changed, 149 insertions(+), 215 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3f6d8ef42cd5f..3457ec6b8e73b 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -295,16 +295,29 @@ ) ], ), - "minicpmv": VLMTestInfo( + "minicpmv_25": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 img_idx_to_prompt=lambda idx: "(./)\n", max_model_len=4096, max_num_seqs=2, get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], postprocess_inputs=model_utils.wrap_inputs_post_processor, - hf_output_post_proc=model_utils.minicmpv_trunc_hf_output, + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, + ), + "minicpmv_26": VLMTestInfo( + models=["openbmb/MiniCPM-V-2_6"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "(./)\n", + max_model_len=4096, + max_num_seqs=2, + 
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 + postprocess_inputs=model_utils.ignore_inputs_post_processor( + "image_sizes" + ), + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 849857b4232e7..15f15dd7d8030 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -170,7 +170,7 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ####### Post-processors for HF outputs -def minicmpv_trunc_hf_output(hf_output: RunnerOutput, +def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output if output_str.endswith("<|eot_id|>"): @@ -197,6 +197,17 @@ def process(hf_inputs: BatchEncoding, dtype: str): return process +def ignore_inputs_post_processor( + hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]: + """Gets a handle to a post processor which ignores a given key.""" + + def process(hf_inputs: BatchEncoding, dtype: str): + del hf_inputs[hf_inp_key] + return hf_inputs + + return process + + def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): return {"model_inputs": hf_inputs} diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5570771ac917b..8c6f7c6e06515 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -242,7 +242,7 @@ def _load_per_tensor_weight_scale(self, shard_id: str, def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, shard_id: str, - loaded_weight: torch.tensor, + loaded_weight: torch.Tensor, tp_rank: int): # Load grouped weight scales for group quantization # or model weights @@ -261,7 +261,7 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, shard_dim: int, shard_id: str, - loaded_weight: torch.tensor, + loaded_weight: torch.Tensor, tp_rank: int): # for per channel weight quantization if shard_id == "w2": @@ -274,7 +274,7 @@ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, tp_rank=tp_rank) def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): # Index the loaded weight for tp sharding. # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim @@ -292,7 +292,7 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, expert_data.copy_(loaded_weight) def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): # Index the loaded weight for tp sharding. 
# down_proj: "RowParallel" so tp sharding on input_dim @@ -311,7 +311,7 @@ def _load_single_value(self, param: torch.nn.Parameter, param_data[expert_id] = loaded_weight def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor, - shard_dim: int, loaded_weight: torch.tensor, tp_rank: int): + shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int): if shard_id == "w2": self._load_w2(shard_id=shard_id, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index c9a573278a136..6254d26c7060d 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -52,7 +52,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -378,6 +378,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, org_num_embeddings=config.vocab_size, ) + self.num_experts = getattr(self.config, "num_experts", 0) self._init_layers(prefix, config, cache_config, quant_config) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( @@ -437,6 +438,73 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ("ws" if weight_name in ["w1", "w3"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", expert_id) + for expert_id in range(self.num_experts) + for weight_name in ["w1", "w2", "w3"] + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { @@ -480,8 +548,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.cache_config = cache_config self.quant_config = quant_config - self.num_experts = getattr(self.config, "num_experts", 0) - self._init_model(vllm_config=vllm_config, prefix=prefix) + self.model = self._init_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -506,8 +575,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors) def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): - self.model = MiniCPMModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + return MiniCPMModel(vllm_config=vllm_config, prefix=prefix) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -546,72 +614,9 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - expert_params_mapping = [ - # (param_name, weight_name, expert_id) - ("ws" if weight_name in ["w1", "w3"] else "w2s", - f"experts.{expert_id}.{weight_name}.weight", expert_id) - for expert_id in range(self.num_experts) - for weight_name in ["w1", "w2", "w3"] - ] - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - # With tie_word_embeddings, we can skip lm_head.weight - # The weight might appear unnecessarily in the files if the model is - # processed with quantization, LoRA, fine-tuning, etc. - if self.config.tie_word_embeddings and "lm_head.weight" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for param_name, weight_name, expert_id in expert_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - weight_name, - expert_id=expert_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c66be2d9c2d07..e9d7eada1d16c 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -40,7 +40,7 @@ MiniCPMForCausalLM, MiniCPMModel) -from .utils import make_layers, maybe_prefix +from .utils import make_layers class MiniCPM3Attention(nn.Module): @@ -248,5 +248,4 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): } def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): - self.model = MiniCPM3Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index aacce477e0460..1e8f9bd4cf418 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -22,7 +22,7 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re -from functools import partial +from functools import cached_property, partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -37,19 +37,15 @@ from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) -from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.llama import LlamaModel -from vllm.model_executor.models.minicpm import MiniCPMModel +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.model_executor.models.utils import LLMWrapper +from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor @@ -58,11 +54,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import is_pp_missing_parameter, maybe_prefix - -_KEYS_TO_MODIFY_MAPPING = { - "llm.lm_head": "lm_head", -} +from .utils import AutoWeightsLoader, maybe_prefix RawImageType = Union[Image.Image, torch.Tensor] @@ -297,10 +289,9 @@ def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): def 
get_placeholder(image_size: Tuple[int, int], num_image: int): if version == (2, 0) or version == (2, 5): - return image_processor. \ - get_slice_image_placeholder(image_size) - return image_processor. \ - get_slice_image_placeholder(image_size, num_image) + return image_processor.get_slice_image_placeholder(image_size) + return image_processor.get_slice_image_placeholder( + image_size, num_image) prompt = inputs.get("prompt") token_ids = inputs.get("prompt_token_ids") @@ -400,37 +391,32 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.vpm = self.init_vision_module(config, quant_config, prefix=maybe_prefix(prefix, "vpm")) - param_dtype = torch.get_default_dtype() - self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else self.vpm.embeddings.embed_dim) self.embed_dim = self.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim, quant_config=quant_config, prefix=maybe_prefix( prefix, "resampler")) - self.resampler.to(device="cuda", dtype=param_dtype) - # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix( - prefix, "llm.lm_head")) - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) + @cached_property + def sampler(self): + if hasattr(self.llm, "sampler"): + return self.llm.sampler + + return get_sampler() + def get_embedding( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], ) -> Tuple[torch.Tensor, torch.Tensor]: - vlm_embedding: torch.Tensor = self.llm.embed_tokens(input_ids) - if hasattr(self.config, "scale_emb"): - vlm_embedding *= self.config.scale_emb + vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids) if image_inputs is None: # No image vision_hidden_states = torch.tensor([], device=input_ids.device) @@ -575,7 +561,7 @@ def forward( # for `torch.compile` integration input_ids = None - output = self.llm( + output = self.llm.model( input_ids=input_ids, positions=positions, kv_caches=kv_caches, @@ -590,9 +576,7 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits + return self.llm.compute_logits(hidden_states, sampling_metadata) def sample( self, @@ -604,52 +588,8 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in name: - name = name.replace(key_to_modify, new_key) - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. 
- continue - use_default_weight_loading = False - if self.is_default_weight_loading(name): - use_default_weight_loading = True - else: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - use_default_weight_loading = True - if use_default_weight_loading: - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) def get_mm_mapping(self) -> MultiModelKeys: """ @@ -693,9 +633,6 @@ def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor: raise NotImplementedError - def is_default_weight_loading(self, name: str) -> bool: - raise NotImplementedError - class MiniCPMV2_0(MiniCPMVBaseModel): @@ -708,8 +645,7 @@ def init_llm( vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(MiniCPMModel(vllm_config=vllm_config, prefix=prefix), - name="model") + return MiniCPMForCausalLM(vllm_config=vllm_config, prefix=prefix) def init_vision_module( self, @@ -717,11 +653,12 @@ def init_vision_module( quant_config: Optional[QuantizationConfig], prefix: str = "", ) -> nn.Module: - # TODO :refactor this vision model + # TODO: refactor this vision model try: import timm except ImportError: raise ImportError("Please install timm==0.9.10") from ImportError + with set_default_torch_dtype(torch.float16): model = timm.create_model( "vit_so400m_patch14_siglip_384.webli", @@ -731,6 +668,8 @@ def init_vision_module( dynamic_img_pad=True, ) + model = model.to(dtype=torch.get_default_dtype()) + if (isinstance(model, timm.models.VisionTransformer) and model.attn_pool is not None): model.attn_pool = torch.nn.Identity() @@ -759,7 +698,7 @@ def init_resampler(self, quant_config=quant_config, prefix=prefix) - return resampler + return resampler.to(device="cuda", dtype=torch.get_default_dtype()) def get_vision_embedding( self, @@ -790,9 +729,6 @@ def get_vision_hidden_states(self, return self.get_vision_embedding(pixel_values) - def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name or "vpm" in name - class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): packed_modules_mapping = { @@ -843,8 +779,7 @@ def init_llm( vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(LlamaModel(vllm_config=vllm_config, prefix=prefix), - name="model") + return LlamaForCausalLM(vllm_config=vllm_config, prefix=prefix) def init_vision_module( self, @@ -871,7 +806,8 @@ def init_resampler(self, kv_dim=vision_dim, quant_config=quant_config, prefix=prefix) - return resampler + + return resampler.to(device="cuda", dtype=torch.get_default_dtype()) def get_vision_embedding( self, @@ -913,9 +849,6 @@ def get_vision_hidden_states(self, return self.get_vision_embedding(all_pixel_values.type(dtype), patch_attn_mask, tgt_sizes) - def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name - class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): packed_modules_mapping = { @@ -966,8 +899,7 @@ def init_llm( vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return 
LLMWrapper(Qwen2Model(vllm_config=vllm_config, prefix=prefix), - name="model") + return Qwen2ForCausalLM(vllm_config=vllm_config, prefix=prefix) def init_vision_module( self, @@ -995,7 +927,8 @@ def init_resampler(self, kv_dim=vision_dim, quant_config=quant_config, prefix=prefix) - return resampler + + return resampler.to(device="cuda", dtype=torch.get_default_dtype()) def get_vision_embedding( self, @@ -1043,9 +976,6 @@ def get_vision_hidden_states(self, return self.resampler(vision_embedding, tgt_sizes) - def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name - _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 4c13cbc953273..a6b40a233439b 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Protocol, Set, Tuple, Union, overload) +from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, Union, overload) import torch import torch.nn as nn @@ -560,30 +560,6 @@ def make_empty_intermediate_tensors( return make_empty_intermediate_tensors -class LLMWrapper(nn.Module): - """ - To align with the key names of LoRA trained with PEFT, we need to add an - additional layer to the llm's implementation. - """ - - def __init__(self, llm: nn.Module, name: str) -> None: - super().__init__() - self.model_name = name - setattr(self, name, llm) - - def __getattr__(self, key: str): - llm = super().__getattr__(self.model_name) - if key == self.model_name: - return llm - - return getattr(llm, key) - - # We need to explicitly override this - def __call__(self, *args: Any, **kwargs: Any) -> Any: - llm = super().__getattr__(self.model_name) - return llm(*args, **kwargs) - - def get_vit_attn_backend(support_fa: bool = False) -> _Backend: """ Get the available attention backend for Vision Transformer. 
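
A recurring pattern in this series is replacing hand-rolled load_weights loops (stacked-parameter mappings, key-rename tables, is_default_weight_loading hooks) with delegation to AutoWeightsLoader from vllm/model_executor/models/utils.py. Below is a minimal sketch of the resulting pattern, using only the keyword arguments that actually appear in these patches (skip_prefixes, and the mapper argument used in the Molmo patch later in the series); ExampleForCausalLM and the prefix mapping are purely illustrative and not taken from any file in the series.

from typing import Iterable, Set, Tuple

import torch
import torch.nn as nn

from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper


class ExampleForCausalLM(nn.Module):
    """Hypothetical model, shown only to illustrate the delegation pattern."""

    def load_weights(self,
                     weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
        # Rename checkpoint prefixes up front (example mapping) instead of
        # patching names inside a per-weight loop.
        hf_to_vllm_mapper = WeightsMapper(
            orig_to_new_prefix={"transformer.": "model."})

        # AutoWeightsLoader walks this module's submodules and dispatches each
        # weight to the right parameter; skip_prefixes drops weights that
        # should not be loaded (e.g. "lm_head." when word embeddings are tied,
        # as in the MiniCPM change above).
        loader = AutoWeightsLoader(self, skip_prefixes=["lm_head."])
        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)

This delegation is also why LLMWrapper can be removed: once init_llm returns a full *ForCausalLM module, its own load_weights, compute_logits, and sampler are reused directly, which is what the init_llm and compute_logits changes above do.
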
From 8d832441e603a7b4244e1c7811eb76bcfc4541c3 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 29 Nov 2024 13:17:57 +0800 Subject: [PATCH 094/293] [Misc] typo find in sampling_metadata.py (#10740) Signed-off-by: Andrew Feldman --- vllm/model_executor/sampling_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 84f35f75a0c32..1df8f84ed4093 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -454,6 +454,7 @@ def from_sampling_metadata( if do_penalties: for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids + sampling_params = seq_group.sampling_params if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) From d8499c0b6cdbc14fcc7868ae9428a7fc27608d47 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 29 Nov 2024 21:56:46 +0800 Subject: [PATCH 095/293] [Bugfix] Fix Idefics3 bug (#10778) Signed-off-by: Jee Jee Li Signed-off-by: Andrew Feldman --- vllm/model_executor/models/idefics3.py | 92 +++++++++++++------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 014e27bc869d4..e5d2edbd81eb1 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -267,54 +267,56 @@ def input_processor_for_idefics3(ctx: InputContext, n_images_in_text = [] text = inputs.get("prompt") - if text is not None: - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, " - "or a list of strings") - - fake_image_token = processor.fake_image_token.content - image_token = processor.image_token.content - global_img_token = processor.global_image_tag - - prompt_strings = [] - for sample, sample_rows, sample_cols in zip(text, image_rows, - image_cols): - n_images_in_text.append(sample.count(image_token)) - - # Replace the image token with fake tokens around the expanded - # image token sequence of length `image_seq_len` - image_prompt_strings = [] - for n_rows, n_cols in zip(sample_rows, sample_cols): - image_prompt_string = _get_image_prompt_string( - n_rows, - n_cols, - processor.image_seq_len, - image_token=image_token, - fake_token_around_image=fake_image_token, - global_img_token=global_img_token, - ) - image_prompt_strings.append(image_prompt_string) - - split_sample = sample.split(image_token) - if len(split_sample) == 0: - raise ValueError( - "The image token should be present in the text.") + if text is None: + prompt_token_ids = inputs.get("prompt_token_ids", []) + assert prompt_token_ids + text = tokenizer.decode(prompt_token_ids) + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. 
Please provide a string, " + "or a list of strings") + + fake_image_token = processor.fake_image_token.content + image_token = processor.image_token.content + global_img_token = processor.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): + n_images_in_text.append(sample.count(image_token)) + + # Replace the image token with fake tokens around the expanded + # image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = _get_image_prompt_string( + n_rows, + n_cols, + processor.image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) - # Place in the image prompt strings where the image tokens are - sample = split_sample[0] - for i, image_prompt_string in enumerate(image_prompt_strings): - sample += image_prompt_string + split_sample[i + 1] - prompt_strings.append(sample) + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError("The image token should be present in the text.") - prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) - return token_inputs( - prompt_token_ids=prompt_token_ids, - prompt=prompt_strings[0], - multi_modal_data=multi_modal_data, - ) + prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + + return token_inputs( + prompt_token_ids=prompt_token_ids, + prompt=prompt_strings[0], + multi_modal_data=multi_modal_data, + ) def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int: From 3c8ced24a9ca74e6a4dfde3425932b5b2f3778eb Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 29 Nov 2024 23:22:21 +0800 Subject: [PATCH 096/293] [platform] Add verify_quantization in platform. (#10757) Signed-off-by: wangxiyuan Signed-off-by: Andrew Feldman --- vllm/config.py | 28 +--------------------------- vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 13 +++++++++++++ vllm/platforms/neuron.py | 2 ++ vllm/platforms/openvino.py | 1 + vllm/platforms/rocm.py | 15 +++++++++++++++ vllm/platforms/tpu.py | 2 ++ vllm/platforms/xpu.py | 1 + 10 files changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index cd24e9ffdf598..b1e5b412fec8f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -393,17 +393,11 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = QUANTIZATION_METHODS - rocm_supported_quantization = [ - "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8", "gguf" - ] optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", "compressed-tensors", "experts_int8" ] - tpu_supported_quantization = ["tpu_int8"] - neuron_supported_quantization = ["neuron_quant"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -438,32 +432,12 @@ def _verify_quantization(self) -> None: raise ValueError( f"Unknown quantization method: {self.quantization}. 
Must " f"be one of {supported_quantization}.") - if current_platform.is_rocm( - ) and self.quantization not in rocm_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in ROCm.") - if current_platform.is_tpu( - ) and self.quantization not in tpu_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in TPU Backend.") + current_platform.verify_quantization(self.quantization) if self.quantization not in optimized_quantization_methods: logger.warning( "%s quantization is not fully " "optimized yet. The speed can be slower than " "non-quantized models.", self.quantization) - if (self.quantization == "awq" and current_platform.is_rocm() - and not envs.VLLM_USE_TRITON_AWQ): - logger.warning( - "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" - " is not set, enabling VLLM_USE_TRITON_AWQ.") - envs.VLLM_USE_TRITON_AWQ = True - if current_platform.is_neuron( - ) and self.quantization not in neuron_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in Neuron Backend.") def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 3e22c87f61fac..b5333fbd6f502 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -19,6 +19,7 @@ class CpuPlatform(Platform): _enum = PlatformEnum.CPU + device_name: str = "cpu" device_type: str = "cpu" dispatch_key: str = "CPU" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 5e9ce551f2332..846a1869da228 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -72,6 +72,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: class CudaPlatformBase(Platform): _enum = PlatformEnum.CUDA + device_name: str = "cuda" device_type: str = "cuda" dispatch_key: str = "CUDA" diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 3071136e43b85..10aaa6d54962c 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -12,6 +12,7 @@ class HpuPlatform(Platform): _enum = PlatformEnum.HPU + device_name: str = "hpu" device_type: str = "hpu" dispatch_key: str = "HPU" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 3328665029039..eac2b413f9271 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -56,11 +56,13 @@ def to_int(self) -> int: class Platform: _enum: PlatformEnum + device_name: str device_type: str # available dispatch keys: # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa # use "CPU" as a fallback for platforms not registered in PyTorch dispatch_key: str = "CPU" + supported_quantization: list[str] = [] def is_cuda(self) -> bool: return self._enum == PlatformEnum.CUDA @@ -171,6 +173,17 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ pass + @classmethod + def verify_quantization(cls, quant: str) -> None: + """ + Verify whether the quantization is supported by the current platform. 
+ """ + if cls.supported_quantization and \ + quant not in cls.supported_quantization: + raise ValueError( + f"{quant} quantization is currently not supported in " + f"{cls.device_name}.") + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 4c4d778ed3dd4..87655ea198303 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -10,7 +10,9 @@ class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON + device_name: str = "neuron" device_type: str = "neuron" + supported_quantization: list[str] = ["neuron_quant"] @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index ea5ec7b40b95c..29b61e955d9ab 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -23,6 +23,7 @@ class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO + device_name: str = "openvino" device_type: str = "openvino" dispatch_key: str = "CPU" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d2f44c3e423e3..3c14fbc179f69 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -4,6 +4,7 @@ import torch +import vllm.envs as envs from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -35,8 +36,13 @@ class RocmPlatform(Platform): _enum = PlatformEnum.ROCM + device_name: str = "rocm" device_type: str = "cuda" dispatch_key: str = "CUDA" + supported_quantization: list[str] = [ + "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", + "fbgemm_fp8", "gguf" + ] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: @@ -79,3 +85,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "vllm.spec_decode.spec_decode_worker.create_spec_worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" + + @classmethod + def verify_quantization(cls, quant: str) -> None: + super().verify_quantization(quant) + if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ: + logger.warning( + "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" + " is not set, enabling VLLM_USE_TRITON_AWQ.") + envs.VLLM_USE_TRITON_AWQ = True diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 137af57023ea9..b138f7e1c54c5 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -16,8 +16,10 @@ class TpuPlatform(Platform): _enum = PlatformEnum.TPU + device_name: str = "tpu" device_type: str = "tpu" dispatch_key: str = "XLA" + supported_quantization: list[str] = ["tpu_int8"] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 69388a8e0f27c..9665786f4c499 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -16,6 +16,7 @@ class XPUPlatform(Platform): _enum = PlatformEnum.XPU + device_name: str = "xpu" device_type: str = "xpu" dispatch_key: str = "XPU" From 5146352a8dad7f5fca3e8e07ed1ac22515189e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 30 Nov 2024 05:07:13 +0100 Subject: [PATCH 097/293] [Bugfix] Fix OpenVino/Neuron `driver_worker` init (#10779) Signed-off-by: NickLucche Signed-off-by: Cyrus Leung Co-authored-by: Cyrus Leung Signed-off-by: Andrew Feldman --- vllm/executor/neuron_executor.py | 6 ++++-- vllm/executor/openvino_executor.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/executor/neuron_executor.py 
b/vllm/executor/neuron_executor.py index 31e6fdc3ab1bb..a9efc4f9a801c 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -29,11 +29,13 @@ def _init_worker(self): wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.driver_worker = wrapper.init_worker( + wrapper.init_worker( vllm_config=self.vllm_config, local_rank=0, rank=0, - distributed_init_method=distributed_init_method) + distributed_init_method=distributed_init_method, + ) + self.driver_worker = wrapper.worker self.driver_worker.init_device() self.driver_worker.load_model() diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index db0070ce510ee..057a32364e512 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -36,7 +36,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.driver_worker = wrapper.init_worker( + wrapper.init_worker( ov_core=ov.Core(), vllm_config=self.vllm_config, local_rank=0, @@ -45,6 +45,7 @@ def _init_worker(self): kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) + self.driver_worker = wrapper.worker self.driver_worker.init_device() self.driver_worker.load_model() From d95da875116ac41816dc1db71f17a552003a64e2 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 30 Nov 2024 12:19:14 +0800 Subject: [PATCH 098/293] [Model] Refactor Molmo weights loading to use AutoWeightsLoader (#10771) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- vllm/model_executor/models/molmo.py | 213 +++++++++++++++------------- 1 file changed, 111 insertions(+), 102 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index acedddd84d7cb..98caa6857e211 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -3,7 +3,7 @@ from array import array from dataclasses import dataclass from functools import lru_cache, partial -from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict +from typing import Iterable, List, Mapping, Optional, Set, Tuple, TypedDict import torch from einops import rearrange @@ -44,7 +44,8 @@ from vllm.transformers_utils.processor import get_processor from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (get_vit_attn_backend, +from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -720,6 +721,42 @@ def forward( # image_features: (batch_size, num_image, num_patch, d_model) return image_features + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + @support_torch_compile class MolmoModel(nn.Module): @@ -804,6 +841,28 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + if "gate_up_proj" in name: + up_proj, gate_proj = loaded_weight.chunk(2, dim=0) + loaded_weight = torch.cat([gate_proj, up_proj], dim=0) + + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + cached_get_processor = lru_cache(get_processor) @@ -1200,103 +1259,53 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - - params_mapping = [ - ("model.transformer.ln_f.weight", "model.norm.weight"), - ("attn_out", "self_attn.o_proj"), - ("att_proj", "self_attn.qkv_proj"), - ("q_norm", "self_attn.q_norm"), - ("k_norm", "self_attn.k_norm"), - ("attn_norm", "input_layernorm"), - ("ff_norm", "post_attention_layernorm"), - ] - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - - embedding_weight = dict() - projector_weight = dict() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if self.config.tie_word_embeddings and "lm_head.weight" in name: - continue - - if "wte.embedding" in name: - embedding_weight["embedding"] = loaded_weight - continue - - if "wte.new_embedding" in name: - embedding_weight["new_embedding"] = loaded_weight - continue - - if "vision_backbone" in name: - if name.startswith("model"): - name = name[len("model."):] - if 'image_projector' in name: - if 'w1' in name: - projector_weight['gate_proj'] = loaded_weight - elif 'w3' in name: - projector_weight['up_proj'] = loaded_weight - elif 'w2' in name: - projector_weight['down_proj'] = loaded_weight - else: - raise ValueError( - f"Unexpected projector weight: {name}") - continue - else: - if "transformer.blocks" in name: - name = name.replace("transformer.blocks", "layers") - - if "ff_proj" in name: - name = name.replace("ff_proj", "mlp.gate_up_proj") - assert 'weight' in name - up_weight, gate_weight = loaded_weight.chunk(2, dim=0) - loaded_weight = torch.cat([gate_weight, up_weight], dim=0) - - elif "ff_out" in name: - if "layers" in name: - name = name.replace("ff_out", "mlp.down_proj") - else: - # lm head - name = name.replace("model.transformer.ff_out", - "lm_head") - - else: - for (param_name, weight_name) in params_mapping: - if param_name in name: - name = name.replace(param_name, weight_name) - break - - try: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - except KeyError: - raise ValueError(f"Unexpected weight: {name}") from None - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - - gate_up_proj_weight = torch.cat( - [projector_weight["gate_proj"], projector_weight["up_proj"]], - dim=0) - name = "vision_backbone.image_projector.gate_up_proj.weight" - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, gate_up_proj_weight) - - down_proj_weight = projector_weight["down_proj"] - name = "vision_backbone.image_projector.down_proj.weight" - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, down_proj_weight) - - embedding_weight = torch.cat( - [embedding_weight["embedding"], embedding_weight["new_embedding"]], - dim=0) - name = "model.embed_tokens.weight" - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, embedding_weight) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + # vision backbone mapping + "image_projector.w1.": "image_projector.gate_proj.", + "image_projector.w3.": "image_projector.up_proj.", + "image_projector.w2.": "image_projector.down_proj.", + # language backbone mapping + "att_proj": "self_attn.qkv_proj", + "attn_out": "self_attn.o_proj", + "q_norm": "self_attn.q_norm", + "k_norm": "self_attn.k_norm", + "ff_proj": "mlp.gate_up_proj", + "ff_out": "mlp.down_proj", + "attn_norm": "input_layernorm", + "ff_norm": "post_attention_layernorm", + }, + orig_to_new_prefix={ + # vision backbone mapping + "model.vision_backbone.": "vision_backbone.", + # language backbone mapping + "model.transformer.blocks.": "model.layers.", + "model.transformer.ln_f.": "model.norm.", + # lm_head is renamed to model.transformer.mlp.down_proj firstly, + # we need to run a second renaming for it + "model.transformer.mlp.down_proj.": "lm_head.", + }, + ) + loader = AutoWeightsLoader(self) + weights = _get_weights_with_merged_embedding(weights) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) + + +def _get_weights_with_merged_embedding( + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: + embedding_weights = {} + for name, weight in weights: + if "wte.embedding" in name: + embedding_weights["embedding"] = weight + elif "wte.new_embedding" in name: + embedding_weights["new_embedding"] = weight + else: + yield (name, weight) + # this is compatible with most of quantization, + # because they won't quantize embed_tokens + embedding_weights = torch.cat( + [embedding_weights["embedding"], embedding_weights["new_embedding"]], + dim=0, + ) + yield ("model.embed_tokens.weight", embedding_weights) From 7831672c0ea16b647f9e462f67331a315efd4804 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 30 Nov 2024 08:45:50 +0100 Subject: [PATCH 099/293] [Interleaved ATTN] Support for Mistral-8B (#10591) Signed-off-by: youkaichao Co-authored-by: youkaichao Signed-off-by: Andrew Feldman --- vllm/model_executor/models/llama.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fe94bb352961b..ff0ab011a9158 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -54,7 +54,7 @@ from .interfaces 
import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - is_pp_missing_parameter, + extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -114,6 +114,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() + layer_idx = extract_layer_index(prefix) self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() self.total_num_heads = num_heads @@ -168,6 +169,18 @@ def __init__( rope_scaling=rope_scaling, is_neox_style=is_neox_style, ) + + if hasattr(config, "interleaved_sliding_window"): + if isinstance(config.interleaved_sliding_window, int): + sliding_window = config.interleaved_sliding_window + elif isinstance(config.interleaved_sliding_window, list): + sw_idx = layer_idx % len(config.interleaved_sliding_window) + sliding_window = config.interleaved_sliding_window[sw_idx] + else: + raise ValueError(f"{type(sliding_window)} is not supported.") + else: + sliding_window = None + self.attn = Attention( self.num_heads, self.head_dim, @@ -175,6 +188,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", ) From a877540b5460f0830fb80679a734288750f1f41c Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Sat, 30 Nov 2024 19:38:40 +0800 Subject: [PATCH 100/293] [doc] format fix (#10789) Signed-off-by: wangxiyuan Signed-off-by: Andrew Feldman --- .../automatic_prefix_caching/details.md | 2 +- .../getting_started/gaudi-installation.rst | 36 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/source/automatic_prefix_caching/details.md b/docs/source/automatic_prefix_caching/details.md index 2d3214e28ed93..17f806217aa65 100644 --- a/docs/source/automatic_prefix_caching/details.md +++ b/docs/source/automatic_prefix_caching/details.md @@ -25,7 +25,7 @@ With this mapping, we can add another indirection in vLLM’s KV cache managemen This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. -# Generalized Caching Policy +## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 68c1a56660fa4..249e08278ff8f 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -4,7 +4,7 @@ Installation with Intel® Gaudi® AI Accelerators This README provides instructions on running vLLM with Intel Gaudi devices. Requirements and Installation -============================= +----------------------------- Please follow the instructions provided in the `Gaudi Installation Guide `__ @@ -13,7 +13,7 @@ please follow the methods outlined in the `Optimizing Training Platform Guide `__. 
Requirements ------------- +~~~~~~~~~~~~ - OS: Ubuntu 22.04 LTS - Python: 3.10 @@ -22,7 +22,7 @@ Requirements Quick start using Dockerfile ----------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: console $ docker build -f Dockerfile.hpu -t vllm-hpu-env . @@ -34,10 +34,10 @@ Quick start using Dockerfile Build from source ------------------ +~~~~~~~~~~~~~~~~~ Environment verification -~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^ To verify that the Intel Gaudi software was correctly installed, run: @@ -53,7 +53,7 @@ Verification `__ @@ -107,7 +107,7 @@ Supported Features - Attention with Linear Biases (ALiBi) Unsupported Features -==================== +-------------------- - Beam search - LoRA adapters @@ -115,7 +115,7 @@ Unsupported Features - Prefill chunking (mixed-batch inferencing) Supported Configurations -======================== +------------------------ The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. @@ -152,10 +152,10 @@ Gaudi2 devices. Configurations that are not listed may or may not work. with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling Performance Tuning -================== +------------------ Execution modes ---------------- +~~~~~~~~~~~~~~~ Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. @@ -184,7 +184,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected Bucketing mechanism -------------------- +~~~~~~~~~~~~~~~~~~~ Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. @@ -233,7 +233,7 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. Warmup ------- +~~~~~~ Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: @@ -257,7 +257,7 @@ This example uses the same buckets as in *Bucketing mechanism* section. Each out Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. 
It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. HPU Graph capture ------------------ +~~~~~~~~~~~~~~~~~ `HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. @@ -321,7 +321,7 @@ Each described step is logged by vLLM server, as follows (negative values corres Recommended vLLM Parameters ---------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~ - We recommend running inference on Gaudi 2 with ``block_size`` of 128 for BF16 data type. Using default values (16, 32) might lead to @@ -333,7 +333,7 @@ Recommended vLLM Parameters If you encounter out-of-memory issues, see troubleshooting section. Environment variables ---------------------- +~~~~~~~~~~~~~~~~~~~~~ **Diagnostic and profiling knobs:** @@ -380,7 +380,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs Troubleshooting: Tweaking HPU Graphs -==================================== +------------------------------------ If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following From cbf14899b4889ffbfa485b288fb934518111c97c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Dec 2024 08:02:54 +0800 Subject: [PATCH 101/293] [Model] Replace embedding models with pooling adapter (#10769) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 4 +- docs/source/models/supported_models.rst | 15 ++- tests/conftest.py | 1 - .../embedding/language/test_embedding.py | 5 + tests/models/test_registry.py | 31 +++--- .../my_gemma_embedding.py | 45 +++++++- tests/test_config.py | 3 +- vllm/config.py | 25 +++++ vllm/inputs/registry.py | 16 +-- vllm/model_executor/layers/pooler.py | 4 +- vllm/model_executor/model_loader/loader.py | 18 +++- vllm/model_executor/model_loader/utils.py | 18 +++- vllm/model_executor/models/adapters.py | 98 +++++++++++++++++ vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/gemma2.py | 58 +--------- vllm/model_executor/models/internvl.py | 5 +- vllm/model_executor/models/llama.py | 102 ++---------------- vllm/model_executor/models/llava.py | 5 +- vllm/model_executor/models/llava_next.py | 26 +---- .../model_executor/models/llava_next_video.py | 5 +- vllm/model_executor/models/llava_onevision.py | 5 +- vllm/model_executor/models/paligemma.py | 5 +- vllm/model_executor/models/phi3v.py | 39 +++---- vllm/model_executor/models/pixtral.py | 5 +- vllm/model_executor/models/qwen2.py | 28 +++-- vllm/model_executor/models/qwen2_vl.py | 18 +--- vllm/model_executor/models/registry.py | 59 ++++++---- vllm/model_executor/models/ultravox.py | 5 +- vllm/model_executor/models/utils.py | 24 ++++- vllm/multimodal/base.py | 6 +- vllm/multimodal/registry.py | 5 +- vllm/utils.py | 22 +++- 32 files changed, 387 insertions(+), 323 deletions(-) create mode 100644 vllm/model_executor/models/adapters.py diff --git a/.buildkite/test-pipeline.yaml 
b/.buildkite/test-pipeline.yaml index c6d31b837c55d..02a80640ac3f8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -343,7 +343,6 @@ steps: commands: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model - - pytest -v -s models/embedding/vision_language -m core_model - label: Language Models Test (Extended) # 50min optional: true @@ -355,7 +354,6 @@ steps: commands: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' - - pytest -v -s models/embedding/vision_language -m 'not core_model' - label: Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] @@ -368,6 +366,7 @@ steps: commands: - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/embedding/vision_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model @@ -385,6 +384,7 @@ steps: # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 7b7a83f20871b..f571b8bf6735e 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -357,7 +357,7 @@ Text Embedding - ✅︎ * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - ✅︎ - ✅︎ * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` @@ -378,6 +378,10 @@ Text Embedding .. tip:: You can override the model's pooling method by passing :code:`--override-pooler-config`. +.. note:: + :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. + You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. + .. note:: Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. @@ -397,12 +401,21 @@ Reward Modeling - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlamaForCausalLM` + - Llama-based + - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. + - ✅︎ + - ✅︎ * - :code:`Qwen2ForRewardModel` - Qwen2-based - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +.. important:: + For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, + e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + .. 
note:: As an interim measure, these models are supported in both offline and online inference via Embeddings API. diff --git a/tests/conftest.py b/tests/conftest.py index d56942d8912af..36f1d477fab59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -263,7 +263,6 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, - is_embedding_model: bool = False, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 36b1e5887981c..5ef8540265d14 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,6 +4,8 @@ """ import pytest +from vllm.config import PoolerConfig + from ..utils import check_embeddings_close @@ -33,6 +35,9 @@ def test_models( dtype: str, ) -> None: vllm_extra_kwargs = {} + if model == "ssmits/Qwen2-7B-Instruct-embed-base": + vllm_extra_kwargs["override_pooler_config"] = \ + PoolerConfig(pooling_type="MEAN") if model == "Alibaba-NLP/gte-Qwen2-7B-instruct": vllm_extra_kwargs["hf_overrides"] = {"is_causal": False} diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 289ea66b5ebc5..1886b1f9898ad 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -6,11 +6,8 @@ from vllm.model_executor.models import (is_embedding_model, is_text_generation_model, supports_multimodal) -# yapf conflicts with isort for this block -# yapf: disable -from vllm.model_executor.models.registry import (_CROSS_ENCODER_MODELS, - _EMBEDDING_MODELS, - _MULTIMODAL_MODELS, +from vllm.model_executor.models.adapters import as_embedding_model +from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS, _SPECULATIVE_DECODING_MODELS, _TEXT_GENERATION_MODELS, ModelRegistry) @@ -26,18 +23,18 @@ def test_registry_imports(model_arch): model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) if model_arch in _SPECULATIVE_DECODING_MODELS: - pass # Ignore these models which do not have a unified format - else: - assert is_text_generation_model(model_cls) is ( - model_arch in _TEXT_GENERATION_MODELS - or model_arch in _MULTIMODAL_MODELS) - - embedding_models = {**_EMBEDDING_MODELS, **_CROSS_ENCODER_MODELS} - assert is_embedding_model(model_cls) is (model_arch - in embedding_models) - - assert supports_multimodal(model_cls) is (model_arch - in _MULTIMODAL_MODELS) + return # Ignore these models which do not have a unified format + + if (model_arch in _TEXT_GENERATION_MODELS + or model_arch in _MULTIMODAL_MODELS): + assert is_text_generation_model(model_cls) + + # All vLLM models should be convertible to an embedding model + embed_model = as_embedding_model(model_cls) + assert is_embedding_model(embed_model) + + if model_arch in _MULTIMODAL_MODELS: + assert supports_multimodal(model_cls) @fork_new_process_for_each_test diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index 21958b1640204..d676eacffb056 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,13 +1,34 @@ -from typing import List, Optional, Union +from typing import Iterable, List, Optional, Tuple, Union import torch +import torch.nn as nn from vllm.attention import AttentionMetadata 
-from vllm.model_executor.models.gemma2 import Gemma2EmbeddingModel -from vllm.sequence import IntermediateTensors +from vllm.config import VllmConfig +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.models.gemma2 import Gemma2Model +from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.sequence import IntermediateTensors, PoolerOutput -class MyGemma2Embedding(Gemma2EmbeddingModel): +class MyGemma2Embedding(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self._pooler = Pooler.from_config_with_defaults( + vllm_config.model_config.pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False, + ) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) def forward( self, @@ -18,7 +39,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = super().forward( + hidden_states = self.model( input_ids, positions, kv_caches, @@ -32,3 +53,17 @@ def forward( # Return all-zero embeddings return torch.zeros_like(hidden_states) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) + return self.model.load_weights(weights) diff --git a/tests/test_config.py b/tests/test_config.py index 3cf90297ce177..45b0b938af215 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -26,8 +26,7 @@ def test_auto_task(model_id, expected_task): @pytest.mark.parametrize(("model_id", "bad_task"), [ - ("facebook/opt-125m", "embedding"), - ("intfloat/e5-mistral-7b-instruct", "generate"), + ("Qwen/Qwen2.5-Math-RM-72B", "generate"), ]) def test_incorrect_task(model_id, bad_task): with pytest.raises(ValueError, match=r"does not support the .* task"): diff --git a/vllm/config.py b/vllm/config.py index b1e5b412fec8f..51b8cf24803ab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -370,6 +370,31 @@ def _resolve_task( selected_task = next(iter(supported_tasks_lst)) if len(supported_tasks) > 1: + suffix_to_preferred_task: List[Tuple[str, _Task]] = [ + # Hardcode the models that are exceptions + ("AquilaModel", "generate"), + ("ChatGLMModel", "generate"), + # Other models follow this pattern + ("ForCausalLM", "generate"), + ("ForConditionalGeneration", "generate"), + ("ChatModel", "generate"), + ("LMHeadModel", "generate"), + ("EmbeddingModel", "embedding"), + ("RewardModel", "embedding"), + ("ForSequenceClassification", "embedding"), + ] + info, arch = ModelRegistry.inspect_model_cls(architectures) + + for suffix, pref_task in suffix_to_preferred_task: + if arch.endswith(suffix) and pref_task in supported_tasks: + selected_task = pref_task + break + else: + if (arch.endswith("Model") + and info.architecture.endswith("ForCausalLM") + and "embedding" in supported_tasks): + selected_task = "embedding" + logger.info( "This model supports multiple tasks: %s. 
" "Defaulting to '%s'.", supported_tasks, selected_task) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 68b4756331e6d..85ab4355cc2e4 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,8 +11,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, - resolve_mm_processor_kwargs) +from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, + print_warning_once, resolve_mm_processor_kwargs) from .data import ProcessorInputs, SingletonInputs from .parse import is_encoder_decoder_inputs @@ -136,12 +136,12 @@ class InputRegistry: """ def __init__(self) -> None: - self._dummy_factories_by_model_type: Dict[Type[nn.Module], - DummyDataFactory] = {} - self._dummy_encoder_factories_by_model_type: Dict[ - Type[nn.Module], DummyDataFactory] = {} - self._input_processors_by_model_type: Dict[Type[nn.Module], - InputProcessor] = {} + self._dummy_factories_by_model_type = \ + ClassRegistry[nn.Module, DummyDataFactory]() + self._dummy_encoder_factories_by_model_type = \ + ClassRegistry[nn.Module, DummyDataFactory]() + self._input_processors_by_model_type = \ + ClassRegistry[nn.Module, InputProcessor]() def _default_dummy_data_factory( self, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index f9437b4112ceb..e0d42e30ebef3 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -60,9 +60,7 @@ def from_config_with_defaults( softmax: bool, step_tag_id: Optional[int] = None, returned_token_ids: Optional[List[int]] = None, - ) -> Optional["Pooler"]: - if pooler_config is None: - return None + ) -> "Pooler": return cls( pooling_type=PoolingType[pooler_config.pooling_type] if pooler_config.pooling_type is not None else pooling_type, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 37c2d789030b6..0e12bc5691538 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -9,6 +9,7 @@ import json import math import os +import warnings from abc import ABC, abstractmethod from contextlib import contextmanager from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast @@ -97,22 +98,31 @@ def device_loading_context(module: torch.nn.Module, logger = init_logger(__name__) -def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: +def _initialize_model( + vllm_config: VllmConfig, + *, + prefix: str = "", + architectures: Optional[list[str]] = None, +) -> nn.Module: """Initialize a model with the given configurations.""" model_config = vllm_config.model_config - model_class, _ = get_model_architecture(model_config) + model_class, _ = get_model_architecture(model_config, + architectures=architectures) + signatures = inspect.signature(model_class.__init__) all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class with set_current_vllm_config(vllm_config): return model_class(vllm_config=vllm_config, prefix=prefix) + msg = ("vLLM model class should accept `vllm_config` and `prefix` as " "input arguments. Possibly you have an old-style model class" " registered from out of tree and it is used for new vLLM version. 
" "Check https://docs.vllm.ai/en/latest/design/arch_overview.html " "for the design and update the model class accordingly.") - logger.warning(msg) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + logger.warning( "Trying to guess the arguments for old-style model class %s", model_class, @@ -356,7 +366,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: weights_to_load = {name for name, _ in model.named_parameters()} loaded_weights = model.load_weights( self._get_all_weights(model_config, model)) - # We only enable strict check for non-quantiized models + # We only enable strict check for non-quantized models # that have loaded weights tracking currently. if model_config.quantization is None and loaded_weights is not None: weights_not_loaded = weights_to_load - loaded_weights diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index b95c0b7cd0612..864dd04e79921 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,12 +1,13 @@ """Utilities for selecting and loading models.""" import contextlib -from typing import Tuple, Type +from typing import Optional, Tuple, Type import torch from torch import nn from vllm.config import ModelConfig from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.adapters import as_embedding_model @contextlib.contextmanager @@ -19,8 +20,13 @@ def set_default_torch_dtype(dtype: torch.dtype): def get_model_architecture( - model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: - architectures = getattr(model_config.hf_config, "architectures", []) + model_config: ModelConfig, + *, + architectures: Optional[list[str]] = None, +) -> Tuple[Type[nn.Module], str]: + if architectures is None: + architectures = getattr(model_config.hf_config, "architectures", []) + # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. 
mixtral_supported = [ @@ -32,7 +38,11 @@ def get_model_architecture( and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] - return ModelRegistry.resolve_model_cls(architectures) + model_cls, arch = ModelRegistry.resolve_model_cls(architectures) + if model_config.task == "embedding": + model_cls = as_embedding_model(model_cls) + + return model_cls, arch def get_architecture_class_name(model_config: ModelConfig) -> str: diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py new file mode 100644 index 0000000000000..360433a07c5b8 --- /dev/null +++ b/vllm/model_executor/models/adapters.py @@ -0,0 +1,98 @@ +from collections.abc import Iterable +from typing import Any, TypeVar + +import torch +import torch.nn as nn + +from .interfaces_base import VllmModelForEmbedding, is_embedding_model + +_T = TypeVar("_T", bound=type[nn.Module]) + + +def as_embedding_model(cls: _T) -> _T: + """Subclass an existing vLLM model to support embeddings.""" + # Avoid modifying existing embedding models + if is_embedding_model(cls): + return cls + + # Lazy import + from vllm.config import VllmConfig + from vllm.model_executor.layers.pooler import (Pooler, PoolerOutput, + PoolingType) + from vllm.model_executor.pooling_metadata import PoolingMetadata + + from .utils import AutoWeightsLoader, WeightsMapper + + class ModelForEmbedding(cls, VllmModelForEmbedding): + + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + # These are not used in embedding models + for attr in ("lm_head", "logits_processor"): + if hasattr(self, attr): + delattr(self, attr) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + # If the model already defines a pooler instance, don't overwrite it + if not getattr(self, "_pooler", None): + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False, + ) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + # TODO: Support uninitialized params tracking + + # We have deleted this attribute, so don't load it + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) + + # If `*ForCausalLM` defines `load_weights` on the inner model + # and there are no other inner modules with parameters, + # we support loading from both `*Model` and `*ForCausalLM` + if hasattr(self, "model") and hasattr(self.model, "load_weights"): + # Whether only `self.model` contains parameters + model_is_only_param = all( + name == "model" or next(child.parameters(), None) is None + for name, child in self.named_children()) + + if model_is_only_param: + mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = mapper.apply(weights) + + self.model.load_weights(weights) + return + + # For most other models + if hasattr(cls, "load_weights"): + cls.load_weights(self, weights) # type: ignore + # Fallback + else: + loader = AutoWeightsLoader(self) + loader.load_weights(weights) + + ModelForEmbedding.__name__ = cls.__name__ \ + .removesuffix("ForCausalLM") \ + .removesuffix("ForConditionalGeneration") \ + .removesuffix("ChatModel") \ + .removesuffix("LMHeadModel") + "ForEmbedding" + + return 
ModelForEmbedding # type: ignore diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index d2592016aff34..76b8505ee1c2a 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -512,9 +512,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index d35fcb012e166..4664aa53ea092 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -30,19 +30,17 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, +from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -455,55 +453,3 @@ def load_weights(self, weights: Iterable[Tuple[str, if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) - - -class Gemma2EmbeddingModel(nn.Module, SupportsPP): - """ - A model that uses Gemma2 with additional embedding functionalities. - - This class encapsulates the Gemma2Model and provides an interface for - embedding operations and customized pooling functions. - - Attributes: - model: An instance of Gemma2Model used for forward operations. - _pooler: An instance of Pooler used for pooling operations. 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - self.model = Gemma2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - vllm_config.model_config.pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - return self.model(input_ids, positions, kv_caches, attn_metadata, - intermediate_tensors, inputs_embeds) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights - if not name.startswith("lm_head.")) - self.model.load_weights(weights) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index b1c0065afbf30..86aab38032450 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -474,9 +474,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: ) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.mlp1 = self._init_mlp1(config) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index ff0ab011a9158..31dfb235ae877 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -37,7 +37,6 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) @@ -47,14 +46,13 @@ DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - extract_layer_index, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -511,11 +509,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = 
vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config - self.model = self._init_model(vllm_config=vllm_config, prefix=prefix) + self.model = self._init_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -544,13 +543,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.STEP, - normalize=False, - softmax=False) def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): return LlamaModel(vllm_config=vllm_config, prefix=prefix) @@ -581,14 +576,6 @@ def compute_logits( sampling_metadata) return logits - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - logits = self.compute_logits(hidden_states, None) - return self._pooler(logits, pooling_metadata) - def sample(self, logits: torch.Tensor, sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: next_tokens = self.sampler(logits, sampling_metadata) @@ -639,78 +626,3 @@ def permute(w: torch.Tensor, n_heads: int): name = name.replace(item, mapping[item]) return name, loaded_weight - - -class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): - """ - A model that uses Llama with additional embedding functionalities. - - This class encapsulates the LlamaModel and provides an interface for - embedding operations and customized pooling functions. - - Attributes: - model: An instance of LlamaModel used for forward operations. - _pooler: An instance of Pooler used for pooling operations. 
- """ - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - } - embedding_padding_modules = [] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - pooler_config = vllm_config.model_config.pooler_config - - self.model = LlamaModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - return self.model(input_ids, positions, kv_caches, attn_metadata, - intermediate_tensors, inputs_embeds) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights - if not name.startswith("lm_head.")) - self.model.load_weights(weights) - - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - self.model.load_kv_cache_scales(quantization_param_path) - - # LRUCacheWorkerLoRAManager instantiation requires model config. 
- @property - def config(self): - return self.model.config diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index e7757b3c7d405..7fd4b32774798 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -319,9 +319,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e113f5862830d..a39f2f4124d05 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -14,13 +14,11 @@ from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext) -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, @@ -286,7 +284,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config vision_feature_layer = config.vision_feature_layer @@ -321,17 +318,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) - - # The same model class supports both language generation and embedding - # because the architecture name is the same - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -678,13 +669,6 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b130791808924..0de9d8c5ea572 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -275,9 +275,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: 
text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 3166737d61582..0bebc1c745e2b 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -422,9 +422,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: prefix=maybe_prefix(prefix, "vision_tower")) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 2e5b6bee784e7..253e689e50a3b 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -151,9 +151,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config config.text_config.architectures = ["GemmaForCausalLM"] self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) logit_scale = getattr(config, "logit_scale", 1.0) self.language_model.logits_processor.scale *= logit_scale diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4cb874a13e0c1..eef23029a2aca 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -29,24 +29,22 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.clip import CLIPVisionModel -from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) 
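The multimodal models above all move to the keyword-only form of init_vllm_registered_model, passing the inner text config and prefix explicitly (and, in phi3v below, an architectures override). A minimal sketch of how a composite model might wire up its language backbone with the updated helper; OuterModel and its config attributes are illustrative, not part of this patch:

import torch.nn as nn

from vllm.config import VllmConfig
from vllm.model_executor.models.utils import (init_vllm_registered_model,
                                              maybe_prefix)


class OuterModel(nn.Module):
    """Hypothetical wrapper that delegates text generation to an inner LM."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__()
        config = vllm_config.model_config.hf_config

        # Keyword-only arguments keep the call site explicit; passing
        # architectures=["LlamaForCausalLM"] (as phi3v does) lets the
        # registry apply the embedding wrapper automatically when the
        # outer model is loaded for the embedding task.
        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
        )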
logger = init_logger(__name__) @@ -536,7 +534,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -556,18 +553,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config, prefix=maybe_prefix(prefix, "model.vision_embed_tokens")) - # The prefix is empty intentionally because default prefix of - # LlamaForCausalLM is "model" - self.language_model = LlamaForCausalLM(vllm_config=vllm_config, - prefix="") - - # The same model class supports both language generation and embedding - # because the architecture name is the same - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + # The prefix is empty intentionally because default prefix of + # LlamaForCausalLM is "model" + prefix="", + # We don't directly initialize vLLM's LlamaForCausalLM so we + # can automatically apply embedding wrapper if this model is + # initialized as an embedding model + architectures=["LlamaForCausalLM"], + ) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -739,13 +735,6 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 45171c1a04b17..215727cadd954 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -172,9 +172,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # init MistralForCausalLM self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.vision_encoder = VisionTransformer(self.vision_args) self.vision_language_adapter = VisionLanguageAdapter( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 87943e53d861c..7d4cc4b69e614 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -31,6 +31,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -55,6 +56,8 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class Qwen2MLP(nn.Module): @@ -433,7 +436,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config 
lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -454,14 +456,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() - # The same model class supports both language generation and embedding - # because the architecture name is the same - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -499,13 +493,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( @@ -553,6 +540,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + # TODO: Replace this model class with for_embedding(Qwen2ForCausalLM), + # after changing the default pooling method + if pooler_config.pooling_type is None: + logger.warning( + "This embedding model will default to last-token pooling in " + "an upcoming version. To avoid breaking changes, you should " + "pass `--override-pooler-config '{\"pooling_type\": \"MEAN\"}'`" + " explicitly.") + self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.MEAN, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7956a98b21569..27175dbae7483 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -50,7 +50,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( @@ -59,14 +58,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend -from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData +from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor @@ -1070,7 +1068,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config multimodal_config = 
vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -1102,11 +1099,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) + self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -1361,13 +1354,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c400c7d59828c..7d2bfce9ba264 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -20,6 +20,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform +from .adapters import as_embedding_model from .interfaces import (has_inner_state, is_attention_free, supports_cross_encoding, supports_multimodal, supports_pp) @@ -107,15 +108,15 @@ "RobertaForMaskedLM": ("roberta", "RobertaEmbeddingModel"), "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), - "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), + "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), - "LlamaModel": ("llama", "LlamaEmbeddingModel"), + "LlamaModel": ("llama", "LlamaForCausalLM"), **{ # Multiple models share the same architecture, so we include them all k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items() if arch == "LlamaForCausalLM" }, - "MistralModel": ("llama", "LlamaEmbeddingModel"), + "MistralModel": ("llama", "LlamaForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), @@ -125,7 +126,7 @@ # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), - "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 } _CROSS_ENCODER_MODELS = { @@ -208,6 +209,7 @@ @dataclass(frozen=True) class _ModelInfo: + architecture: str is_text_generation_model: bool is_embedding_model: bool supports_cross_encoding: bool @@ -218,9 +220,19 @@ class _ModelInfo: @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + is_embedding_model_ = is_embedding_model(model) + if not is_embedding_model_: + try: + as_embedding_model(model) + except Exception: + pass + else: + is_embedding_model_ = True + return _ModelInfo( + architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), - is_embedding_model=is_embedding_model(model), + is_embedding_model=is_embedding_model_, supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), @@ 
-399,13 +411,13 @@ def _normalize_archs( def inspect_model_cls( self, architectures: Union[str, List[str]], - ) -> _ModelInfo: + ) -> Tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: model_info = self._try_inspect_model_cls(arch) if model_info is not None: - return model_info + return (model_info, arch) return self._raise_for_unsupported(architectures) @@ -426,39 +438,50 @@ def is_text_generation_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).is_text_generation_model + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_text_generation_model def is_embedding_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).is_embedding_model + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_embedding_model def is_cross_encoder_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).supports_cross_encoding + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_cross_encoding def is_multimodal_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).supports_multimodal + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_multimodal def is_pp_supported_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).supports_pp + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_pp - def model_has_inner_state(self, architectures: Union[str, - List[str]]) -> bool: - return self.inspect_model_cls(architectures).has_inner_state + def model_has_inner_state( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.has_inner_state - def is_attention_free_model(self, architectures: Union[str, - List[str]]) -> bool: - return self.inspect_model_cls(architectures).is_attention_free + def is_attention_free_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_attention_free ModelRegistry = _ModelRegistry({ diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index b61deccde45b7..ea1e5401d42c0 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -360,9 +360,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): )) self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) if config.text_model_id is not None: # this prefix is not for initialization, but for loading weights # note the trailing dot diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index a6b40a233439b..7a1e1f9bf2be4 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -173,8 +173,15 @@ def _load_module( module_load_weights = getattr(module, "load_weights", None) if callable(module_load_weights): loaded_params = module_load_weights(weights) - yield from map(lambda x: self._get_qualname(base_prefix, x), - loaded_params) + if 
loaded_params is None: + logger.warning( + "Unable to collect loaded parameters " + "for module %s", module) + else: + yield from map( + lambda x: self._get_qualname(base_prefix, x), + loaded_params, + ) child_modules = dict(module.named_children()) child_params = dict(module.named_parameters(recurse=False)) @@ -232,17 +239,24 @@ def load_weights( def init_vllm_registered_model( - hf_config: PretrainedConfig, vllm_config: VllmConfig, + *, prefix: str = "", + hf_config: Optional[PretrainedConfig] = None, + architectures: Optional[list[str]] = None, ) -> nn.Module: """ Helper function to initialize an inner model registered to vLLM, based on the arguments passed to the outer vLLM model. """ from vllm.model_executor.model_loader.loader import _initialize_model - vllm_config = vllm_config.with_hf_config(hf_config) - return _initialize_model(vllm_config, prefix) + + if hf_config is not None: + vllm_config = vllm_config.with_hf_config(hf_config) + + return _initialize_model(vllm_config=vllm_config, + prefix=prefix, + architectures=architectures) @overload diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6eec660e42ac4..bbb8fb4bc1cd1 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -7,7 +7,7 @@ from vllm.inputs import InputContext from vllm.logger import init_logger -from vllm.utils import (get_allowed_kwarg_only_overrides, +from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, resolve_mm_processor_kwargs) if TYPE_CHECKING: @@ -54,8 +54,8 @@ class MultiModalPlugin(ABC): """ def __init__(self) -> None: - self._input_mappers: Dict[Type[nn.Module], MultiModalInputMapper] = {} - self._max_mm_tokens: Dict[Type[nn.Module], MultiModalTokensCalc] = {} + self._input_mappers = ClassRegistry[nn.Module, MultiModalInputMapper]() + self._max_mm_tokens = ClassRegistry[nn.Module, MultiModalTokensCalc]() @abstractmethod def get_data_key(self) -> str: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b992442d3b314..b73daee98bd80 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -9,6 +9,7 @@ from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import ClassRegistry from .audio import AudioPlugin from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc @@ -62,8 +63,8 @@ def __init__( plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: self._plugins = {p.get_data_key(): p for p in plugins} - self._processor_factories: Dict[Type[nn.Module], - MultiModalProcessorFactory] = {} + self._processor_factories = ClassRegistry[nn.Module, + MultiModalProcessorFactory]() # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} diff --git a/vllm/utils.py b/vllm/utils.py index 6f7a6f8c54e47..0165a22582e7b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -20,7 +20,7 @@ import warnings import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task -from collections import defaultdict +from collections import UserDict, defaultdict from collections.abc import Iterable, Mapping from functools import lru_cache, partial, wraps from platform import uname @@ -1517,13 +1517,13 @@ def value(self): # Adapted from: https://stackoverflow.com/a/47212782/5082708 -class LazyDict(Mapping, Generic[T]): +class LazyDict(Mapping[str, T], Generic[T]): def __init__(self, factory: Dict[str, Callable[[], T]]): self._factory = factory 
self._dict: Dict[str, T] = {} - def __getitem__(self, key) -> T: + def __getitem__(self, key: str) -> T: if key not in self._dict: if key not in self._factory: raise KeyError(key) @@ -1540,6 +1540,22 @@ def __len__(self): return len(self._factory) +class ClassRegistry(UserDict[type[T], _V]): + + def __getitem__(self, key: type[T]) -> _V: + for cls in key.mro(): + if cls in self.data: + return self.data[cls] + + raise KeyError(key) + + def __contains__(self, key: object) -> bool: + if not isinstance(key, type): + return False + + return any(cls in self.data for cls in key.mro()) + + def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: """ Create a weak reference to a tensor. From db1ca39f0568fa64fe736a79b45dcc46d6b900cf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Dec 2024 09:48:35 +0800 Subject: [PATCH 102/293] [Misc] Improve type annotations for `support_torch_compile` (#10763) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- vllm/compilation/decorators.py | 38 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 8b81a29936989..8700243c9d904 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,7 +1,8 @@ import inspect -from typing import Dict, List, Optional, Union +from typing import Callable, Dict, List, Optional, TypeVar, Union, overload import torch +import torch.nn as nn from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher @@ -12,10 +13,27 @@ logger = init_logger(__name__) +_T = TypeVar("_T", bound=type[nn.Module]) + + +@overload +def support_torch_compile( + *, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]], +) -> Callable[[_T], _T]: + ... + + +@overload +def support_torch_compile(cls: _T) -> _T: + ... + def support_torch_compile( - cls: Optional[type] = None, - dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None): + cls: Optional[_T] = None, + *, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None, +) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -66,7 +84,7 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): computation graph. """ - def cls_decorator_helper(cls: type): + def cls_decorator_helper(cls: _T) -> _T: # helper to pass `dynamic_arg_dims`` to `_support_torch_compile`` # to avoid too much indentation for `_support_torch_compile`` if not hasattr(cls, 'forward'): @@ -105,8 +123,10 @@ def cls_decorator_helper(cls: type): return cls_decorator_helper -def _support_torch_compile(cls: type, - dynamic_arg_dims: Dict[str, Union[int, List[int]]]): +def _support_torch_compile( + cls: _T, + dynamic_arg_dims: Dict[str, Union[int, List[int]]], +) -> _T: """ A decorator to add support for compiling the forward method of a class. 
""" @@ -119,7 +139,7 @@ def _support_torch_compile(cls: type, # other than TorchCompileWrapperWithCustomDispatcher cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) - old_init = cls.__init__ # type: ignore + old_init = cls.__init__ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) @@ -135,7 +155,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): TorchCompileWrapperWithCustomDispatcher.__init__( self, compilation_level=vllm_config.compilation_config.level) - cls.__init__ = __init__ # type: ignore + cls.__init__ = __init__ def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation @@ -180,5 +200,5 @@ def __call__(self, *args, **kwargs): model_output = self.forward(*args, **kwargs) return model_output - cls.__call__ = __call__ # type: ignore + cls.__call__ = __call__ return cls From d198e8fc7372134366a2c262c5ea30d7cefb39e2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Dec 2024 14:36:51 +0800 Subject: [PATCH 103/293] [Misc] Rename embedding classes to pooling (#10801) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- examples/offline_inference_embedding.py | 2 +- tests/entrypoints/llm/test_encode.py | 6 +- tests/models/test_registry.py | 4 +- tests/worker/test_model_input.py | 4 +- vllm/__init__.py | 31 +++++++++-- vllm/config.py | 2 +- vllm/engine/async_llm_engine.py | 24 ++++---- vllm/engine/llm_engine.py | 8 +-- vllm/engine/multiprocessing/client.py | 14 ++--- vllm/engine/protocol.py | 5 +- vllm/entrypoints/llm.py | 30 +++++----- vllm/entrypoints/openai/serving_embedding.py | 12 ++-- vllm/entrypoints/openai/serving_score.py | 10 ++-- vllm/model_executor/models/__init__.py | 11 ++-- vllm/model_executor/models/adapters.py | 6 +- vllm/model_executor/models/interfaces.py | 4 +- vllm/model_executor/models/interfaces_base.py | 15 +++-- vllm/model_executor/models/registry.py | 16 +++--- vllm/outputs.py | 55 +++++++++++++------ vllm/v1/engine/async_llm.py | 4 +- vllm/v1/engine/async_stream.py | 8 +-- ..._runner.py => cpu_pooling_model_runner.py} | 4 +- vllm/worker/cpu_worker.py | 4 +- ...odel_runner.py => pooling_model_runner.py} | 6 +- vllm/worker/worker.py | 4 +- 25 files changed, 166 insertions(+), 123 deletions(-) rename vllm/worker/{cpu_embedding_model_runner.py => cpu_pooling_model_runner.py} (98%) rename vllm/worker/{embedding_model_runner.py => pooling_model_runner.py} (98%) diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index 7d5ef128bc8e0..ae158eef2ca4c 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -10,7 +10,7 @@ # Create an LLM. model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) -# Generate embedding. The output is a list of EmbeddingRequestOutputs. +# Generate embedding. The output is a list of PoolingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. 
for output in outputs: diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 4c9f796e5ed71..41163809237e9 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -3,7 +3,7 @@ import pytest -from vllm import LLM, EmbeddingRequestOutput, PoolingParams +from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -43,8 +43,8 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[EmbeddingRequestOutput], - o2: List[EmbeddingRequestOutput]): +def assert_outputs_equal(o1: List[PoolingRequestOutput], + o2: List[PoolingRequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 1886b1f9898ad..b5368aab3ecf1 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -3,7 +3,7 @@ import pytest import torch.cuda -from vllm.model_executor.models import (is_embedding_model, +from vllm.model_executor.models import (is_pooling_model, is_text_generation_model, supports_multimodal) from vllm.model_executor.models.adapters import as_embedding_model @@ -31,7 +31,7 @@ def test_registry_imports(model_arch): # All vLLM models should be convertible to an embedding model embed_model = as_embedding_model(model_cls) - assert is_embedding_model(embed_model) + assert is_pooling_model(embed_model) if model_arch in _MULTIMODAL_MODELS: assert supports_multimodal(model_cls) diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index b36e8bfe73ff3..309854e6babf3 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -8,10 +8,10 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.model_executor import SamplingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.worker.embedding_model_runner import ( - ModelInputForGPUWithPoolingMetadata) from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from vllm.worker.multi_step_model_runner import StatefulModelInput +from vllm.worker.pooling_model_runner import ( + ModelInputForGPUWithPoolingMetadata) class MockAttentionBackend(AttentionBackend): diff --git a/vllm/__init__.py b/vllm/__init__.py index 8f477ea84756d..a10f6d3128cb6 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -7,8 +7,8 @@ from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.model_executor.models import ModelRegistry -from vllm.outputs import (CompletionOutput, EmbeddingOutput, - EmbeddingRequestOutput, RequestOutput) +from vllm.outputs import (CompletionOutput, PoolingOutput, + PoolingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -25,8 +25,8 @@ "SamplingParams", "RequestOutput", "CompletionOutput", - "EmbeddingOutput", - "EmbeddingRequestOutput", + "PoolingOutput", + "PoolingRequestOutput", "LLMEngine", "EngineArgs", "AsyncLLMEngine", @@ -34,3 +34,26 @@ "initialize_ray_cluster", "PoolingParams", ] + + +def __getattr__(name: str): + import warnings + + if name == "EmbeddingOutput": + msg = ("EmbeddingOutput has been renamed to PoolingOutput. 
" + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingOutput + + if name == "EmbeddingRequestOutput": + msg = ("EmbeddingRequestOutput has been renamed to " + "PoolingRequestOutput. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingRequestOutput + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/config.py b/vllm/config.py index 51b8cf24803ab..da043afbe1ae7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -359,7 +359,7 @@ def _resolve_task( # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "generate": ModelRegistry.is_text_generation_model(architectures), - "embedding": ModelRegistry.is_embedding_model(architectures), + "embedding": ModelRegistry.is_pooling_model(architectures), } supported_tasks_lst: List[_Task] = [ task for task, is_supported in task_support.items() if is_supported diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 31a15b04314d5..7b1bb7b05708d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -25,7 +25,7 @@ from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -74,7 +74,7 @@ def _log_task_completion(task: asyncio.Task, class AsyncStream: - """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + """A stream of RequestOutputs or PoolingRequestOutputs for a request that can be iterated over asynchronously via an async generator.""" def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: @@ -83,7 +83,7 @@ def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + def put(self, item: Union[RequestOutput, PoolingRequestOutput, Exception]) -> None: if not self._finished: self._queue.put_nowait(item) @@ -103,7 +103,7 @@ def finished(self) -> bool: async def generator( self - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: try: while True: result = await self._queue.get() @@ -154,7 +154,7 @@ def propagate_exception(self, def process_request_output(self, request_output: Union[RequestOutput, - EmbeddingRequestOutput], + PoolingRequestOutput], *, verbose: bool = False) -> None: """Process a request output from the engine.""" @@ -265,7 +265,7 @@ def __init__(self, *args, **kwargs): async def step_async( self, virtual_engine: int - ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + ) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. 
@@ -907,7 +907,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Coroutine[None, None, AsyncGenerator[Union[ - RequestOutput, EmbeddingRequestOutput], None]]: + RequestOutput, PoolingRequestOutput], None]]: ... @overload @@ -922,7 +922,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Coroutine[None, None, AsyncGenerator[Union[ - RequestOutput, EmbeddingRequestOutput], None]]: + RequestOutput, PoolingRequestOutput], None]]: ... @deprecate_kwargs( @@ -941,7 +941,7 @@ async def add_request( priority: int = 0, *, inputs: Optional[PromptType] = None, # DEPRECATED - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: if inputs is not None: prompt = inputs assert prompt is not None and params is not None @@ -1070,7 +1070,7 @@ async def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from an embedding model. Generate outputs for a request. This method is a coroutine. It adds the @@ -1088,7 +1088,7 @@ async def encode( Only applicable with priority scheduling. Yields: - The output `EmbeddingRequestOutput` objects from the LLMEngine + The output `PoolingRequestOutput` objects from the LLMEngine for the request. Details: @@ -1141,7 +1141,7 @@ async def encode( trace_headers=trace_headers, priority=priority, ): - yield LLMEngine.validate_output(output, EmbeddingRequestOutput) + yield LLMEngine.validate_output(output, PoolingRequestOutput) async def abort(self, request_id: str) -> None: """Abort a request. diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ecc222f692c41..7911dc8d04500 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -40,7 +40,7 @@ get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, +from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -80,7 +80,7 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) -_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) +_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) @dataclass @@ -112,7 +112,7 @@ class SchedulerContext: def __init__(self, multi_step_stream_outputs: bool = False): self.output_queue: Deque[OutputData] = deque() self.request_outputs: List[Union[RequestOutput, - EmbeddingRequestOutput]] = [] + PoolingRequestOutput]] = [] self.seq_group_metadata_list: Optional[ List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None @@ -1314,7 +1314,7 @@ def _advance_to_next_step( else: seq.append_token_id(sample.output_token, sample.logprobs) - def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. 
figure:: https://i.imgur.com/sv2HssD.png diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index fe21c58c775fe..d26728e8c6e67 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -35,7 +35,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs @@ -495,7 +495,7 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: ... @overload @@ -507,7 +507,7 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: ... @deprecate_kwargs( @@ -524,7 +524,7 @@ def encode( priority: int = 0, *, inputs: Optional[PromptType] = None # DEPRECATED - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from an embedding model. Generate outputs for a request. This method is a coroutine. It adds the @@ -540,7 +540,7 @@ def encode( trace_headers: OpenTelemetry trace headers. Yields: - The output `EmbeddingRequestOutput` objects from the LLMEngine + The output `PoolingRequestOutput` objects from the LLMEngine for the request. """ if inputs is not None: @@ -549,7 +549,7 @@ def encode( and request_id is not None) return cast( - AsyncGenerator[EmbeddingRequestOutput, None], + AsyncGenerator[PoolingRequestOutput, None], self._process_request(prompt, pooling_params, request_id, @@ -567,7 +567,7 @@ async def _process_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[ - EmbeddingRequestOutput, None]]: + PoolingRequestOutput, None]]: """Send an RPCGenerateRequest to the RPCServer and stream responses.""" # If already dead, error out. diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e15395d75c91f..4079de7d36793 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -11,8 +11,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import (CompletionOutput, EmbeddingRequestOutput, - RequestOutput) +from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams @@ -209,7 +208,7 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from an embedding model.""" ... 
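With the rename, offline pooling requests come back as PoolingRequestOutput objects, as in the updated examples/offline_inference_embedding.py above. A small usage sketch, assuming a vLLM build that includes this patch (model name and prompt are illustrative and mirror the bundled example):

from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)

# encode() returns a list of PoolingRequestOutput; each .outputs is a
# PoolingOutput whose .embedding is a list of floats.
outputs = llm.encode(["Hello, my name is"])
for output in outputs:
    embedding = output.outputs.embedding
    print(len(embedding))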
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1551a9a998160..a25c401b4ea10 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -26,7 +26,7 @@ from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest, LLMGuidedOptions) -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, @@ -679,7 +679,7 @@ def encode( prompt_token_ids: Optional[List[int]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: multi (prompt + optional token ids) @@ -691,7 +691,7 @@ def encode( prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: single (token ids + optional prompt) @@ -704,7 +704,7 @@ def encode( prompt_token_ids: List[int], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: multi (token ids + optional prompt) @@ -717,7 +717,7 @@ def encode( prompt_token_ids: List[List[int]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: single or multi token ids [pos-only] @@ -728,7 +728,7 @@ def encode( prompt_token_ids: Union[List[int], List[List[int]]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload @@ -741,7 +741,7 @@ def encode( Sequence[PoolingParams]]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @deprecate_kwargs( @@ -759,7 +759,7 @@ def encode( use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: """Generates the completions for the input prompts. This class automatically batches the given prompts, considering @@ -778,7 +778,7 @@ def encode( generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of ``PoolingRequestOutput`` objects containing the generated embeddings in the same order as the input prompts. Note: @@ -821,7 +821,7 @@ def encode( outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, - EmbeddingRequestOutput) + PoolingRequestOutput) def score( self, @@ -832,7 +832,7 @@ def score( use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: """Generates similarity scores for all pairs . The inputs can be 1 -> 1, 1 -> N or N -> N. 
In the 1 - N case @@ -854,7 +854,7 @@ def score( generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of ``PoolingRequestOutput`` objects containing the generated scores in the same order as the input prompts. """ task = self.llm_engine.model_config.task @@ -943,7 +943,7 @@ def ensure_str(prompt: SingletonPrompt): outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, - EmbeddingRequestOutput) + PoolingRequestOutput) def start_profile(self) -> None: self.llm_engine.start_profile() @@ -1085,7 +1085,7 @@ def _add_guided_params( def _run_engine( self, *, use_tqdm: bool - ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + ) -> List[Union[RequestOutput, PoolingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -1098,7 +1098,7 @@ def _run_engine( ) # Run the engine. - outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = [] + outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] total_in_toks = 0 total_out_toks = 0 while self.llm_engine.has_unfinished_requests(): diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 78e2416d9d4da..2cbb252610e39 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -18,14 +18,14 @@ ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.logger import init_logger -from vllm.outputs import EmbeddingOutput, EmbeddingRequestOutput +from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) def _get_embedding( - output: EmbeddingOutput, + output: PoolingOutput, encoding_format: Literal["float", "base64"], ) -> Union[List[float], str]: if encoding_format == "float": @@ -40,7 +40,7 @@ def _get_embedding( def request_output_to_embedding_response( - final_res_batch: List[EmbeddingRequestOutput], request_id: str, + final_res_batch: List[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, encoding_format: Literal["float", "base64"]) -> EmbeddingResponse: data: List[EmbeddingResponseData] = [] @@ -169,7 +169,7 @@ async def create_embedding( return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() @@ -207,7 +207,7 @@ async def create_embedding( num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[EmbeddingRequestOutput]] + final_res_batch: List[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: async for i, res in result_generator: @@ -215,7 +215,7 @@ async def create_embedding( assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[EmbeddingRequestOutput], + final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch) response = request_output_to_embedding_response( diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 7cd8ff08b5608..a1f14449ba9c3 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger -from vllm.outputs import EmbeddingRequestOutput +from vllm.outputs import PoolingRequestOutput from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import make_async, merge_async_iterators, random_uuid @@ -21,7 +21,7 @@ def request_output_to_score_response( - final_res_batch: List[EmbeddingRequestOutput], request_id: str, + final_res_batch: List[PoolingRequestOutput], request_id: str, created_time: int, model_name: str) -> ScoreResponse: data: List[ScoreResponseData] = [] score = None @@ -133,7 +133,7 @@ async def create_score( return self.create_error_response(str(e)) # Schedule the request and get the result generator. 
- generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] input_pairs = make_pairs(request.text_1, request.text_2) @@ -194,7 +194,7 @@ async def create_score( num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[EmbeddingRequestOutput]] + final_res_batch: List[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: @@ -203,7 +203,7 @@ async def create_score( assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[EmbeddingRequestOutput], + final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch) response = request_output_to_score_response( diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index d66373512b95e..a3ef9adad16d9 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,15 +1,14 @@ from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, has_inner_state, supports_lora, supports_multimodal, supports_pp) -from .interfaces_base import (VllmModelForEmbedding, - VllmModelForTextGeneration, is_embedding_model, - is_text_generation_model) +from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration, + is_pooling_model, is_text_generation_model) from .registry import ModelRegistry __all__ = [ "ModelRegistry", - "VllmModelForEmbedding", - "is_embedding_model", + "VllmModelForPooling", + "is_pooling_model", "VllmModelForTextGeneration", "is_text_generation_model", "HasInnerState", @@ -20,4 +19,4 @@ "supports_multimodal", "SupportsPP", "supports_pp", -] \ No newline at end of file +] diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 360433a07c5b8..9cc43ae9181b9 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn -from .interfaces_base import VllmModelForEmbedding, is_embedding_model +from .interfaces_base import VllmModelForPooling, is_pooling_model _T = TypeVar("_T", bound=type[nn.Module]) @@ -12,7 +12,7 @@ def as_embedding_model(cls: _T) -> _T: """Subclass an existing vLLM model to support embeddings.""" # Avoid modifying existing embedding models - if is_embedding_model(cls): + if is_pooling_model(cls): return cls # Lazy import @@ -23,7 +23,7 @@ def as_embedding_model(cls: _T) -> _T: from .utils import AutoWeightsLoader, WeightsMapper - class ModelForEmbedding(cls, VllmModelForEmbedding): + class ModelForEmbedding(cls, VllmModelForPooling): def __init__( self, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 1545ce332309f..01a381381ccec 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.utils import supports_kw -from .interfaces_base import is_embedding_model +from .interfaces_base import is_pooling_model if TYPE_CHECKING: from vllm.attention import AttentionMetadata @@ -389,4 +389,4 @@ def _supports_cross_encoding( def supports_cross_encoding( model: Union[Type[object], object], ) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: - return is_embedding_model(model) and _supports_cross_encoding(model) + return is_pooling_model(model) and _supports_cross_encoding(model) diff --git 
a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 957a5a6e26b5c..de733b6d49a53 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -141,7 +141,7 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForEmbedding(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): def pooler( self, @@ -153,23 +153,22 @@ def pooler( @overload -def is_embedding_model( - model: Type[object]) -> TypeIs[Type[VllmModelForEmbedding]]: +def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: ... @overload -def is_embedding_model(model: object) -> TypeIs[VllmModelForEmbedding]: +def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: ... -def is_embedding_model( +def is_pooling_model( model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForEmbedding]], TypeIs[VllmModelForEmbedding]]: +) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False if isinstance(model, type): - return isinstance(model, VllmModelForEmbedding) + return isinstance(model, VllmModelForPooling) - return isinstance(model, VllmModelForEmbedding) + return isinstance(model, VllmModelForPooling) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7d2bfce9ba264..2b7b69e8c3a95 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -24,7 +24,7 @@ from .interfaces import (has_inner_state, is_attention_free, supports_cross_encoding, supports_multimodal, supports_pp) -from .interfaces_base import is_embedding_model, is_text_generation_model +from .interfaces_base import is_pooling_model, is_text_generation_model logger = init_logger(__name__) @@ -211,7 +211,7 @@ class _ModelInfo: architecture: str is_text_generation_model: bool - is_embedding_model: bool + is_pooling_model: bool supports_cross_encoding: bool supports_multimodal: bool supports_pp: bool @@ -220,19 +220,19 @@ class _ModelInfo: @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": - is_embedding_model_ = is_embedding_model(model) - if not is_embedding_model_: + is_pooling_model_ = is_pooling_model(model) + if not is_pooling_model_: try: as_embedding_model(model) except Exception: pass else: - is_embedding_model_ = True + is_pooling_model_ = True return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), - is_embedding_model=is_embedding_model_, + is_pooling_model=is_pooling_model_, supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), @@ -441,12 +441,12 @@ def is_text_generation_model( model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model - def is_embedding_model( + def is_pooling_model( self, architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) - return model_cls.is_embedding_model + return model_cls.is_pooling_model def is_cross_encoder_model( self, diff --git a/vllm/outputs.py b/vllm/outputs.py index 912e485e40b59..ead37164f1113 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -53,8 +53,8 @@ def __repr__(self) -> str: @dataclass -class EmbeddingOutput: - """The output data of one completion output of a request. 
+class PoolingOutput: + """The output data of one pooling output of a request. Args: embedding: The embedding vector, which is a list of floats. The @@ -63,7 +63,7 @@ class EmbeddingOutput: embedding: List[float] def __repr__(self) -> str: - return (f"EmbeddingOutput(" + return (f"PoolingOutput(" f"embedding={len(self.embedding)})") @@ -327,18 +327,18 @@ def __repr__(self) -> str: f"multi_modal_placeholders={self.multi_modal_placeholders})") -class EmbeddingRequestOutput: +class PoolingRequestOutput: """ - The output data of an embedding request to the LLM. + The output data of a pooling request to the LLM. Args: - request_id (str): A unique identifier for the embedding request. - outputs (EmbeddingOutput): The embedding results for the given input. + request_id (str): A unique identifier for the pooling request. + outputs (PoolingOutput): The pooling results for the given input. prompt_token_ids (List[int]): A list of token IDs used in the prompt. - finished (bool): A flag indicating whether the embedding is completed. + finished (bool): A flag indicating whether the pooling is completed. """ - def __init__(self, request_id: str, outputs: "EmbeddingOutput", + def __init__(self, request_id: str, outputs: "PoolingOutput", prompt_token_ids: List[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids @@ -347,11 +347,11 @@ def __init__(self, request_id: str, outputs: "EmbeddingOutput", @classmethod def from_seq_group(cls, - seq_group: 'SequenceGroup') -> "EmbeddingRequestOutput": + seq_group: 'SequenceGroup') -> "PoolingRequestOutput": if seq_group.embeddings is None: raise ValueError( "Embeddings are missing in seq_group for EmbeddingRequest.") - output = EmbeddingOutput(seq_group.embeddings) + output = PoolingOutput(seq_group.embeddings) prompt_token_ids = seq_group.prompt_token_ids finished = seq_group.is_finished() @@ -359,15 +359,15 @@ def from_seq_group(cls, def __repr__(self): """ - Returns a string representation of an EmbeddingRequestOutput instance. + Returns a string representation of an PoolingRequestOutput instance. The representation includes the request_id and the number of outputs, - providing a quick overview of the embedding request's results. + providing a quick overview of the pooling request's results. Returns: - str: A string representation of the EmbeddingRequestOutput instance. + str: A string representation of the PoolingRequestOutput instance. """ - return (f"EmbeddingRequestOutput(request_id='{self.request_id}', " + return (f"PoolingRequestOutput(request_id='{self.request_id}', " f"outputs={repr(self.outputs)}, " f"prompt_token_ids={self.prompt_token_ids}, " f"finished={self.finished})") @@ -426,7 +426,30 @@ def create(seq_group: SequenceGroup, # Determine the type based on a condition, for example: if hasattr(seq_group, 'embeddings') and seq_group.embeddings is not None: - return EmbeddingRequestOutput.from_seq_group(seq_group) + return PoolingRequestOutput.from_seq_group(seq_group) else: return RequestOutput.from_seq_group(seq_group, use_cache, seq_id_to_seq_group) + + +def __getattr__(name: str): + import warnings + + if name == "EmbeddingOutput": + msg = ("EmbeddingOutput has been renamed to PoolingOutput. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingOutput + + if name == "EmbeddingRequestOutput": + msg = ("EmbeddingRequestOutput has been renamed to " + "PoolingRequestOutput. 
" + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingRequestOutput + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 421ecc8c0d921..1df9bc57a1cb2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -9,7 +9,7 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -133,7 +133,7 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py index 3e6c759ad5ebd..35449238c3259 100644 --- a/vllm/v1/engine/async_stream.py +++ b/vllm/v1/engine/async_stream.py @@ -1,11 +1,11 @@ import asyncio from typing import Any, AsyncGenerator, Callable, Optional, Type, Union -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput class AsyncStream: - """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + """A stream of RequestOutputs or PoolingRequestOutputs for a request that can be iterated over asynchronously via an async generator.""" STOP_ITERATION = Exception() # Sentinel @@ -16,7 +16,7 @@ def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + def put(self, item: Union[RequestOutput, PoolingRequestOutput, Exception]) -> None: if not self._finished: self._queue.put_nowait(item) @@ -32,7 +32,7 @@ def finish( async def generator( self - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: finished = False try: while True: diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py similarity index 98% rename from vllm/worker/cpu_embedding_model_runner.py rename to vllm/worker/cpu_pooling_model_runner.py index 3954e4c4c8a5b..17b2fd2564a04 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -16,12 +16,12 @@ @dataclasses.dataclass(frozen=True) class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): """ - Used by the CPUEmbeddingModelRunner. + Used by the CPUPoolingModelRunner. 
""" pooling_metadata: Optional["PoolingMetadata"] = None -class CPUEmbeddingModelRunner( +class CPUPoolingModelRunner( CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( ModelInputForCPUWithPoolingMetadata) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index cf04808b73372..4fad1a3f4caeb 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -14,9 +14,9 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.worker.cpu_embedding_model_runner import CPUEmbeddingModelRunner from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase +from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerBase, WorkerInput) @@ -164,7 +164,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.task == "embedding": - ModelRunnerClass = CPUEmbeddingModelRunner + ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunnerBase = ModelRunnerClass( diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/pooling_model_runner.py similarity index 98% rename from vllm/worker/embedding_model_runner.py rename to vllm/worker/pooling_model_runner.py index f56805918fd15..1beae1e3884c5 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -21,12 +21,12 @@ @dataclasses.dataclass(frozen=True) class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): """ - Used by the EmbeddingModelRunner. + Used by the PoolingModelRunner. 
""" pooling_metadata: Optional["PoolingMetadata"] = None -class EmbeddingModelRunner( +class PoolingModelRunner( GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( ModelInputForGPUWithPoolingMetadata) @@ -52,7 +52,7 @@ def execute_model( ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( - "EmbeddingModelRunner does not support multi-step execution.") + "PoolingModelRunner does not support multi-step execution.") if self.lora_config: assert model_input.lora_requests is not None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 24e7bc760b0c0..d58cb029618e9 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -22,9 +22,9 @@ from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) from vllm.worker.cache_engine import CacheEngine -from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner +from vllm.worker.pooling_model_runner import PoolingModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) @@ -75,7 +75,7 @@ def __init__( ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner if model_config.task == "embedding": - ModelRunnerClass = EmbeddingModelRunner + ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( From cf04e11d3c62c3cac5af1fb526a1f5a342b39838 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 1 Dec 2024 00:41:38 -0800 Subject: [PATCH 104/293] [doc] add warning about comparing hf and vllm outputs (#10805) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- docs/source/models/supported_models.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f571b8bf6735e..9f3b6f59068e2 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -701,6 +701,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. +.. tip:: + When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json `__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. 
If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. From b58062b5c63af2beffb89d8745aa20924373bf4d Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 1 Dec 2024 00:47:05 -0800 Subject: [PATCH 105/293] [Misc] Adding `MMMU-Pro` vision dataset to serving benchmark (#10804) Signed-off-by: Roger Wang Co-authored-by: Chen Zhang Co-authored-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- benchmarks/benchmark_serving.py | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e9fc037a46965..3256692142c5e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -199,6 +199,56 @@ def sample_sonnet_requests( return sampled_requests +def sample_mmmu_pro_vision_requests( + dataset, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int] = None, +) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + sampled_requests: List[Tuple[str, int, int, Dict[str, + Collection[str]]]] = [] + for data in dataset: + if len(sampled_requests) == num_requests: + break + + # MMMU-Pro vision direct prompt + # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 + prompt = ( + "Answer with the option letter from the given choices directly. " + "The last line of your response should be of the following " + "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " + "options.") + + prompt_token_ids = tokenizer(prompt).input_ids + if fixed_output_len is None: + # Default max output len is set to 128 + print("--hf-output-len is not provided. 
Using default value 128.") + fixed_output_len = 128 + + prompt_len = len(prompt_token_ids) + output_len = fixed_output_len + + assert isinstance( + data["image"], + Image), ("Input image format must be `PIL.Image.Image`, " + f"given {type(data['image'])}.") + image: Image = data["image"] + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + mm_content = { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + sampled_requests.append((prompt, prompt_len, output_len, mm_content)) + + return sampled_requests + + def sample_hf_requests( dataset_path: str, dataset_subset: str, @@ -208,6 +258,21 @@ def sample_hf_requests( random_seed: int, fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + + # Special case for MMMU-Pro vision dataset + if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': + assert dataset_split == "test" + dataset = load_dataset(dataset_path, + name=dataset_subset, + split=dataset_split, + streaming=True) + assert "image" in dataset.features, ( + "MMMU/MMMU_Pro vision dataset must have 'image' column.") + filter_func = lambda x: isinstance(x["image"], Image) + dataset = dataset.shuffle(seed=random_seed).filter(filter_func) + return sample_mmmu_pro_vision_requests(dataset, num_requests, + tokenizer, fixed_output_len) + dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, From bcdb5b801c6bf64fe3035621edcb8f42e217242c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 00:38:36 +0000 Subject: [PATCH 106/293] removed fast tests from pipeline Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 02a80640ac3f8..46692506f01d4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -174,15 +174,6 @@ steps: commands: - VLLM_USE_V1=1 pytest -v -s v1 -- label: V1 Fast Test - #mirror_hardwares: [amd] - fast_check: true - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - VLLM_USE_V1=1 pytest -v -s v1/samplers/test_logprobs.py::test_fast_get_logprobs_and_prompt_logprobs - - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] From 88f7f571b5b981319ffab0bf13fa29d3f8484d64 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 1 Dec 2024 19:01:00 -0600 Subject: [PATCH 107/293] [Core] Implement disagg prefill by StatelessProcessGroup (#10502) This PR provides initial support for single-node disaggregated prefill in 1P1D scenario. 
Signed-off-by: KuntaiDu Co-authored-by: ApostaC Co-authored-by: YaoJiayi <120040070@link.cuhk.edu.cn> Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 4 + .../disagg_overhead_benchmark.sh | 144 +++++++++ .../disagg_performance_benchmark.sh | 164 +++++++++++ .../disagg_prefill_proxy_server.py | 61 ++++ .../disagg_benchmarks/round_robin_proxy.py | 60 ++++ .../visualize_benchmark_results.py | 46 +++ examples/disaggregated_prefill.sh | 109 +++++++ tests/kv_transfer/disagg_test.py | 119 ++++++++ tests/kv_transfer/module_test.py | 64 ++++ tests/kv_transfer/test_lookup_buffer.py | 160 ++++++++++ tests/kv_transfer/test_lookup_buffer.sh | 3 + tests/kv_transfer/test_send_recv.py | 155 ++++++++++ tests/kv_transfer/test_send_recv.sh | 3 + vllm/config.py | 84 ++++++ vllm/distributed/kv_transfer/README.md | 30 ++ vllm/distributed/kv_transfer/__init__.py | 0 .../kv_transfer/disagg_prefill_workflow.jpg | Bin 0 -> 142656 bytes .../kv_transfer/kv_connector/__init__.py | 0 .../kv_transfer/kv_connector/base.py | 122 ++++++++ .../kv_transfer/kv_connector/factory.py | 19 ++ .../kv_connector/simple_connector.py | 261 +++++++++++++++++ .../kv_transfer/kv_lookup_buffer/__init__.py | 0 .../kv_transfer/kv_lookup_buffer/base.py | 108 +++++++ .../kv_lookup_buffer/simple_buffer.py | 242 +++++++++++++++ .../kv_transfer/kv_pipe/__init__.py | 0 vllm/distributed/kv_transfer/kv_pipe/base.py | 65 +++++ .../kv_transfer/kv_pipe/pynccl_pipe.py | 276 ++++++++++++++++++ .../kv_transfer/kv_transfer_agent.py | 75 +++++ vllm/distributed/parallel_state.py | 35 ++- vllm/engine/arg_utils.py | 18 +- vllm/worker/model_runner.py | 105 ++++++- vllm/worker/worker.py | 13 +- vllm/worker/worker_base.py | 1 + 33 files changed, 2525 insertions(+), 21 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py create mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.py create mode 100644 benchmarks/disagg_benchmarks/visualize_benchmark_results.py create mode 100644 examples/disaggregated_prefill.sh create mode 100644 tests/kv_transfer/disagg_test.py create mode 100644 tests/kv_transfer/module_test.py create mode 100644 tests/kv_transfer/test_lookup_buffer.py create mode 100644 tests/kv_transfer/test_lookup_buffer.sh create mode 100644 tests/kv_transfer/test_send_recv.py create mode 100644 tests/kv_transfer/test_send_recv.sh create mode 100644 vllm/distributed/kv_transfer/README.md create mode 100644 vllm/distributed/kv_transfer/__init__.py create mode 100644 vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg create mode 100644 vllm/distributed/kv_transfer/kv_connector/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/base.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/factory.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/simple_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/base.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/base.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py create mode 100644 vllm/distributed/kv_transfer/kv_transfer_agent.py diff --git 
a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 46692506f01d4..f5591f1098534 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -430,6 +430,9 @@ steps: - vllm/model_executor/models/ - tests/distributed/ - vllm/compilation + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/model_runner.py commands: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -443,6 +446,7 @@ steps: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh new file mode 100644 index 0000000000000..2924ea4a49f54 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# benchmark the overhead of disaggregated prefill. +# methodology: +# - send all request to prefill vLLM instance. It will buffer KV cache. +# - then send all request to decode instance. +# - The TTFT of decode instance is the overhead. + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pkill -f pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=10 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + + # let the prefill instance finish prefill + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + 
--result-filename disagg_prefill_2xtp4.json \ + --request-rate "inf" + + + # send the request to decode. + # The TTFT of this command will be the overhead of disagg prefill impl. + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate "$qps" + kill_gpu_processes + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=1 + default_output_len=1 + benchmark $default_qps $default_output_len + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh new file mode 100644 index 0000000000000..d8d9e976dce76 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Requirement: 8x H100 GPUs. + + +# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV +# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests +# Resource: 8x H100 +# Approaches: +# 1. Chunked prefill: 1 vllm instance with tp=8 +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. 
+ pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done + sleep 1 +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +launch_chunked_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + wait_for_server 8100 + wait_for_server 8200 + python3 round_robin_proxy.py & + sleep 1 +} + + +launch_disagg_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + python3 disagg_prefill_proxy_server.py & + sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=100 + qps=$1 + prefix_len=50 + input_len=1024 + output_len=$2 + tag=$3 + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" + + sleep 2 + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx matplotlib aiohttp + + cd "$(dirname "$0")" + + cd .. 
+ # create sonnet-4x.txt so that we can sample 2048 tokens for input + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_output_len=6 + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + launch_chunked_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len chunked_prefill + done + kill_gpu_processes + + launch_disagg_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len disagg_prefill + done + kill_gpu_processes + + python3 visualize_benchmark_results.py + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py new file mode 100644 index 0000000000000..4058b1c0a3b79 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -0,0 +1,61 @@ +import os + +import aiohttp +from quart import Quart, make_response, request + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +app = Quart(__name__) + + +async def forward_request(url, data): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + async with session.post(url=url, json=data, + headers=headers) as response: + if response.status == 200: + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: + async for chunk_bytes in response.content.iter_chunked( + 1024): + yield chunk_bytes + else: + content = await response.read() + yield content + + +@app.route('/v1/completions', methods=['POST']) +async def handle_request(): + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill + prefill_request['max_tokens'] = 1 + + # finish prefill + async for _ in forward_request('http://localhost:8100/v1/completions', + prefill_request): + continue + + # return decode + generator = forward_request('http://localhost:8200/v1/completions', + original_request_data) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + + +if __name__ == '__main__': + app.run(port=8000) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py new file mode 100644 index 0000000000000..6eb5f63980070 --- /dev/null +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -0,0 +1,60 @@ +import asyncio +import itertools + +import aiohttp +from aiohttp import web + + +class RoundRobinProxy: + + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) + + async def handle_request(self, request): + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, + ) as response: + # Start sending the response + resp = web.StreamResponse(status=response.status, + headers=response.headers) + await resp.prepare(request) + + # Stream the response content + async for chunk in 
response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) + + +async def main(): + proxy = RoundRobinProxy([8100, 8200]) + app = web.Application() + app.router.add_route('*', '/{path:.*}', proxy.handle_request) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8000) + await site.start() + + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py new file mode 100644 index 0000000000000..e59d8bb0e6c8c --- /dev/null +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -0,0 +1,46 @@ +import json + +import matplotlib.pyplot as plt +import pandas as pd + +if __name__ == "__main__": + + data = [] + for name in ['disagg_prefill', 'chunked_prefill']: + for qps in [2, 4, 6, 8]: + with open(f"results/{name}-qps-{qps}.json") as f: + x = json.load(f) + x['name'] = name + x['qps'] = qps + data.append(x) + + df = pd.DataFrame.from_dict(data) + dis_df = df[df['name'] == 'disagg_prefill'] + chu_df = df[df['name'] == 'chunked_prefill'] + + plt.style.use('bmh') + plt.rcParams['font.size'] = 20 + + for key in [ + 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', + 'median_itl_ms', 'p99_itl_ms' + ]: + + fig, ax = plt.subplots(figsize=(11, 7)) + plt.plot(dis_df['qps'], + dis_df[key], + label='disagg_prefill', + marker='o', + linewidth=4) + plt.plot(chu_df['qps'], + chu_df[key], + label='chunked_prefill', + marker='o', + linewidth=4) + ax.legend() + + ax.set_xlabel('QPS') + ax.set_ylabel(key) + ax.set_ylim(bottom=0) + fig.savefig(f'results/{key}.png') + plt.close(fig) diff --git a/examples/disaggregated_prefill.sh b/examples/disaggregated_prefill.sh new file mode 100644 index 0000000000000..87155273a81d1 --- /dev/null +++ b/examples/disaggregated_prefill.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# This file demonstrates the example usage of disaggregated prefilling +# We will launch 2 vllm instances (1 for prefill and 1 for decode), +# and then transfer the KV cache between them. + +echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" +sleep 1 + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'cleanup' INT + +# Cleanup function +cleanup() { + echo "Caught Ctrl+C, cleaning up..." + # Cleanup commands + pgrep python | xargs kill -9 + pkill -f python + echo "Cleanup complete. Exiting." + exit 0 +} + +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +# install quart first -- required for disagg prefill proxy serve +if python3 -c "import quart" &> /dev/null; then + echo "Quart is already installed." +else + echo "Quart is not installed. Installing..." + python3 -m pip install quart +fi + +# a function that waits vLLM server to start +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +# You can also adjust --kv-ip and --kv-port for distributed inference. 
+ +# prefilling instance, which is the KV producer +CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + +# decoding instance, which is the KV consumer +CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8200 \ + --max-model-len 100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + +# wait until prefill and decode instances are ready +wait_for_server 8100 +wait_for_server 8200 + +# launch a proxy server that opens the service at port 8000 +# the workflow of this proxy: +# - send the request to prefill vLLM instance (port 8100), change max_tokens +# to 1 +# - after the prefill vLLM finishes prefill, send the request to decode vLLM +# instance +# NOTE: the usage of this API is subject to change --- in the future we will +# introduce "vllm connect" to connect between prefill and decode instances +python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +sleep 1 + +# serve two example requests +output1=$(curl -X POST -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "San Francisco is a", +"max_tokens": 10, +"temperature": 0 +}') + +output2=$(curl -X POST -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "Santa Clara is a", +"max_tokens": 10, +"temperature": 0 +}') + + +# Cleanup commands +pgrep python | xargs kill -9 +pkill -f python + +echo "" + +sleep 1 + +# Print the outputs of the curl requests +echo "" +echo "Output of first request: $output1" +echo "Output of second request: $output2" + +echo "🎉🎉 Successfully finished 2 test requests! 
🎉🎉" +echo "" diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py new file mode 100644 index 0000000000000..adc6150edece6 --- /dev/null +++ b/tests/kv_transfer/disagg_test.py @@ -0,0 +1,119 @@ +import os +import subprocess +import sys +import time +from subprocess import Popen + +import pytest +import requests +import torch + + +# Fixture to set up environment variables and teardown servers after tests +@pytest.fixture(scope="module", autouse=True) +def setup_servers(): + if torch.cuda.device_count() < 4: + pytest.skip("Skipping test: fewer than 4 GPUs available") + + # Set up environment variables + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", + shell=True).decode().strip() + os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP + + # Start prefill instance + prefill_cmd = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8100", + "--gpu-memory-utilization", + "0.5", + "--max-model-len", + "1000", + "--kv-transfer-config", + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",'\ + '"kv_rank":0,"kv_parallel_size":2}', + ] + prefill_env = os.environ.copy() + prefill_env["CUDA_VISIBLE_DEVICES"] = "0" + prefill_proc = Popen(prefill_cmd, env=prefill_env) + + # Start decode instance + decode_cmd = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8200", + "--gpu-memory-utilization", + "0.5", + "--max-model-len", + "1000", + "--kv-transfer-config", + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",'\ + '"kv_rank":1,"kv_parallel_size":2}', + ] + decode_env = os.environ.copy() + decode_env["CUDA_VISIBLE_DEVICES"] = "1" + decode_proc = Popen(decode_cmd, env=decode_env) + + # Wait for servers to be ready + assert wait_for_server(8100), "Prefill server did not start in time" + assert wait_for_server(8200), "Decode server did not start in time" + + # Yield to the test function and handle teardown after tests + yield + + # Cleanup: kill the processes + prefill_proc.terminate() + decode_proc.terminate() + + # Additional cleanup if needed + prefill_proc.wait() + decode_proc.wait() + + +# Helper function to wait for server +def wait_for_server(port, timeout=240): + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(f"http://localhost:{port}/v1/completions") + if response.status_code in [200, 405]: + return True + except requests.ConnectionError: + time.sleep(1) + return False + + +# Test function to send curl requests and validate responses +@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) +def test_disaggregated_prefilling(prompt): + # Send to prefill + response = requests.post("http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + }) + assert response.status_code == 200 + + # Send to decode + response = requests.post("http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + }) + assert response.status_code == 200 diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py new file mode 100644 index 0000000000000..355461919cd7c --- /dev/null +++ 
b/tests/kv_transfer/module_test.py @@ -0,0 +1,64 @@ +import subprocess +import sys + +import pytest +import torch + + +def run_python_script(script_name, timeout): + script_name = f'kv_transfer/{script_name}' + try: + # Start both processes asynchronously using Popen + process0 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": + "0"}, # Set the RANK environment variable for process 0 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + process1 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": + "1"}, # Set the RANK environment variable for process 1 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + # Wait for both processes to complete, with a timeout + process0.wait(timeout=timeout) + process1.wait(timeout=timeout) + + # Check the return status of both processes + if process0.returncode != 0: + pytest.fail( + f"Test {script_name} failed for RANK=0, {process0.returncode}") + if process1.returncode != 0: + pytest.fail( + f"Test {script_name} failed for RANK=1, {process1.returncode}") + + except subprocess.TimeoutExpired: + # If either process times out, terminate both and fail the test + process0.terminate() + process1.terminate() + pytest.fail(f"Test {script_name} timed out") + except Exception as e: + pytest.fail(f"Test {script_name} failed with error: {str(e)}") + + +# Define the test cases using pytest's parametrize +@pytest.mark.parametrize( + "script_name,timeout", + [ + ("test_lookup_buffer.py", + 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout + ]) +def test_run_python_script(script_name, timeout): + # Check the number of GPUs + if torch.cuda.device_count() < 2: + pytest.skip( + f"Skipping test {script_name} because <2 GPUs are available") + + # Run the test if there are at least 2 GPUs + run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py new file mode 100644 index 0000000000000..96b0e58713332 --- /dev/null +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -0,0 +1,160 @@ +import os +import random + +import torch +from tqdm import tqdm + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( + SimpleBuffer) +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe + +# TODO: the test depends on a lot of fields in the current implementation. 
+# We should have standard interface instead direct field access + + +def test_run(my_rank, buffer, device): + + # buffer should be empty in the beginning + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + print("My rank: %d, device: %s" % (my_rank, device)) + + # insert + tokens = torch.tensor([1, 2, 3]).to(device) + roi = (tokens > 0) + if my_rank == 0: + key = 2.0 * torch.ones([5, 6]).to(device) + value = 3.0 * torch.ones([5, 6]).to(device) + + placeholder = torch.tensor([1]).to(device) + + buffer.insert(tokens, roi, key, value, placeholder) + + torch.distributed.barrier() + + # drop_select + if my_rank == 1: + tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) + assert torch.allclose(tokens, tok) + assert torch.allclose(roi, roi_) + assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device)) + assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device)) + torch.distributed.barrier() + + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + print("Test run passed!") + + +def stress_test(my_rank, buf, device): + + torch.distributed.barrier() + torch.manual_seed(100) + + reqs = [ + ( + torch.rand(100).to(device), # tokens + torch.ones(100).bool().to(device), # roi + torch.rand(100).to(device), # key + torch.rand(100).to(device), # value + torch.rand(100).to(device), # hidden + ) for i in tqdm(range(200)) + ] + + random.seed(my_rank) + random.shuffle(reqs) + + torch.distributed.barrier() + + n = 0 + + # the buffer size can only store 100 reqs + # so the sender will occasionally block to wait for the receiver. + for req in tqdm(reqs): + if my_rank == 0: + buf.insert(*req) + else: + tok, roi, k, v, h = req + tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) + + if tok_ is None: + assert roi_ is None + assert k_ is None + assert v_ is None + assert h_ is None + n += 1 + else: + assert torch.allclose(tok, tok_) + assert torch.allclose(roi, roi_) + assert torch.allclose(k, k_) + assert torch.allclose(v, v_) + assert torch.allclose(h, h_) + print('Rank %d done' % my_rank) + torch.distributed.barrier() + + if my_rank == 0: + x = torch.tensor([0]) + torch.distributed.recv(x, 1) + # the # of None received is the kv that are not selected + assert x.item() == len(buf.buffer) + # and the size of the buffer should be 2000 * buffer len + print(buf.buffer_size) + assert buf.buffer_size == 1700 * len(buf.buffer) + else: + torch.distributed.send(torch.tensor([n]), 0) + + print("Passed stress test!") + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + torch.distributed.init_process_group( + backend='gloo', + init_method='tcp://localhost:12398', + world_size=2, + rank=my_rank, + ) + + print("initialized! 
My rank is %d" % my_rank) + + config = KVTransferConfig( + kv_connector='PyNcclConnector', + kv_buffer_device='cuda', + kv_buffer_size=1e9, + kv_rank=my_rank, + kv_role="kv_both", # this arg doesn't matter in this test + kv_parallel_size=2, + kv_ip="127.0.0.1", + kv_port=12345, + ) + + data_pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + device="cuda", + port_offset=0, + ) + cpu_pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + device="cpu", + port_offset=1, + ) + + buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000) + + test_run(my_rank, buffer, data_pipe.device) + + stress_test(my_rank, buffer, data_pipe.device) + + buffer.close() + data_pipe.close() + cpu_pipe.close() + print('Done') diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh new file mode 100644 index 0000000000000..09d7ee018c3f4 --- /dev/null +++ b/tests/kv_transfer/test_lookup_buffer.sh @@ -0,0 +1,3 @@ +#!/bin/bash +RANK=0 python test_lookup_buffer.py & +RANK=1 python test_lookup_buffer.py & \ No newline at end of file diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py new file mode 100644 index 0000000000000..65973bf10a4d7 --- /dev/null +++ b/tests/kv_transfer/test_send_recv.py @@ -0,0 +1,155 @@ +import os +import time +from typing import List + +import torch +from tqdm import tqdm + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe + + +def test_run(my_rank, pipe): + # test run + x = torch.tensor([1]).to(pipe.device) + y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) + if my_rank == 0: + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + + else: + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + + assert torch.allclose(x, x2) + assert torch.allclose(y, y2) + + +def stress_test(my_rank, pipe): + + torch.distributed.barrier() + + tensors: List[torch.Tensor] = [] + + torch.manual_seed(0) + + for i in tqdm(range(500)): + mean = torch.rand(1).item() * 100 + std = torch.rand(1).item() * 100 + size = torch.randint(900, 1000, (2, )) + x = torch.normal(mean * 1.0, std * 1.0, + size=size.tolist()).to(pipe.device) + + # 5% probability of sending a None + if torch.rand(1).item() < 0.05: + tensors.append(None) + tensors.append(None) + tensors.append(None) + else: + tensors.append(x) + tensors.append(x.mean().unsqueeze(0)) + tensors.append(x.std().unsqueeze(0)) + + torch.distributed.barrier() + + for i in tqdm(range(500)): + if my_rank == int((i % 10) > 3): + pipe.send_tensor(tensors[3 * i]) + pipe.send_tensor(tensors[3 * i + 1]) + pipe.send_tensor(tensors[3 * i + 2]) + else: + x = pipe.recv_tensor() + mean = pipe.recv_tensor() + std = pipe.recv_tensor() + + if x is None: + assert mean is None + assert std is None + else: + assert torch.allclose(x, tensors[3 * i]) + assert x.mean() == mean[0] + assert x.std() == std[0] + + torch.distributed.barrier() + + +def latency_test(my_rank, pipe, nelement, ntensor): + + latencies = [] + + torch.distributed.barrier() + + for i in tqdm(range(500)): + + tensors = [] + + if my_rank == 0: + # create tensor + tensors = [ + torch.rand(nelement).to(pipe.device) for _ in range(ntensor) + ] + + torch.distributed.barrier() + + if 
my_rank == 0: + t = torch.tensor([time.time()], + dtype=torch.float64).to(pipe.device) + for tensor in tensors: + pipe.send_tensor(tensor) + pipe.send_tensor(t) + else: + for _ in range(ntensor): + pipe.recv_tensor() + t = pipe.recv_tensor() + latencies.append(time.time() - t.item()) + + torch.distributed.barrier() + + print('Latency test passed.') + print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + torch.distributed.init_process_group( + backend='gloo', + init_method='tcp://localhost:12398', + world_size=2, + rank=my_rank, + ) + + config = KVTransferConfig( + kv_connector='PyNcclConnector', + kv_buffer_device='cuda', + kv_buffer_size=1e9, + kv_rank=my_rank, + kv_role="kv_both", # this arg doesn't matter in this test + kv_parallel_size=2, + kv_ip="127.0.0.1", + kv_port=12345, + ) + + pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + ) + + test_run(my_rank, pipe) + stress_test(my_rank, pipe) + + # Use this function if you want to test the latency of pipe impl. + # latency_test(my_rank, pipe, 1024 * 8 * 128, 80) diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh new file mode 100644 index 0000000000000..1e89e246b4992 --- /dev/null +++ b/tests/kv_transfer/test_send_recv.sh @@ -0,0 +1,3 @@ +#!/bin/bash +RANK=0 python3 test_send_recv.py & +RANK=1 python3 test_send_recv.py & \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index da043afbe1ae7..5d9e2766c7faa 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2052,6 +2052,88 @@ def __post_init__(self): f"installed. Original error:\n{otel_import_error_traceback}") +class KVTransferConfig(BaseModel): + """Configuration for distributed KV cache transfer.""" + + # The KV connector for vLLM to transmit KV caches between vLLM instances. + kv_connector: Optional[str] = None + + # The device used by kv connector to buffer the KV cache. + # Currently only support 'cuda'. + kv_buffer_device: Optional[str] = "cuda" + + # The buffer size for TorchDistributedConnector. Measured in number of + # bytes. Recommended value: 1e9 (about 1GB). + kv_buffer_size: float = 1e9 + + # Whether this vLLM instance produces, consumes KV cache, or both. Choices + # are 'kv_producer', 'kv_consumer', and 'both'. + kv_role: Optional[str] = None + + # The rank of this vLLM instance in the KV cache transfer. Typical value: + # 0 for prefill instance, 1 for decode instance. + # Currently only 1P1D is supported. + kv_rank: Optional[int] = None + + # The number of parallel instances for KV cache transfer. For + # PyNcclConnector, this should be 2. + kv_parallel_size: int = 1 + + # The KV connector ip, used to build distributed connection + kv_ip: str = "127.0.0.1" + + # The KV connector port, used to build distributed connection + kv_port: int = 14579 + + @classmethod + def from_cli(cls, cli_value: str) -> "KVTransferConfig": + """Parse the CLI value for the compilation config.""" + return KVTransferConfig.model_validate_json(cli_value) + + def model_post_init(self, __context: Any) -> None: + if all([ + self.kv_connector is not None, + self.kv_connector != "PyNcclConnector" + ]): + raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. " + f"Supported connectors are " + f"`PyNcclConnector`.") + + if self.kv_role is not None and self.kv_role not in [ + "kv_producer", "kv_consumer", "kv_both" + ]: + raise ValueError( + f"Unsupported kv_role: {self.kv_role}. 
" + f"Supported roles are `kv_producer`, `kv_consumer`, " + f"and `kv_both`") + + if self.kv_connector is not None and self.kv_role is None: + raise ValueError("Please specify kv_disagg_role when kv_connector " + "is set, supported roles are `kv_producer`, " + "`kv_consumer`, and `kv_both`") + + @property + def is_kv_transfer_instance(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_producer", "kv_consumer", "kv_both"] + + @property + def need_kv_parallel_group(self) -> bool: + # for those database-based connector, vLLM does not need to create + # parallel group, and in that case the kv parallel size will be 1. + return self.kv_connector is not None and self.kv_parallel_size > 1 + + @property + def is_kv_producer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_producer", "kv_both"] + + @property + def is_kv_consumer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_consumer", "kv_both"] + + class CompilationLevel: # constants for the levels of the compilation process NO_COMPILATION = 0 @@ -2317,6 +2399,8 @@ class VllmConfig: quant_config: Optional[QuantizationConfig] = None compilation_config: CompilationConfig = field(default=None, init=True) # type: ignore + kv_transfer_config: KVTransferConfig = field(default=None, + init=True) # type: ignore @staticmethod def _get_quantization_config( diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md new file mode 100644 index 0000000000000..dab2d10c4c9d0 --- /dev/null +++ b/vllm/distributed/kv_transfer/README.md @@ -0,0 +1,30 @@ + +# Distributed KV cache transfer + +This folder implements distributed KV cache transfer across vLLM instances. +Currently the main usecase is for disaggregated prefilling. + +## Abstractions + +The KV cache transfer contains three layer of abstractions: + +- KV pipe: a FIFO pipe for torch.tensor transmission. Key APIs: `send_tensor` and `recv_tensor`. +- KV lookup buffer: a lookup buffer for KV caches. Key: the tokens, value: the KV caches (and/or hidden states). Key APIs: `insert` and `drop_select` (similar to SQL semantics). +- KV connector: a connector that connects the KV pipe and KV lookup buffer to vLLM. Key APIs: `send_kv_caches_and_hidden_states` and `recv_kv_caches_and_hidden_states`. + +Why we need KV lookup buffer: FIFO pipe itself is not enough as prefill vLLM worker may process requests in a different order compared to decode vLLM worker. Say the QPS is really high, prefill worker may handle requests in order A -> B -> C, but the decode worker may process request C first. This is not the case that can be naturally handled by FIFO pipe, so we provide KV lookup buffer to help translate a FIFO pipe to a lookup buffer. + +NOTE: KV pipe layer is bypassible: you can skip this layer if your distributed +communication service already supports key-value-based lookup (like redis or +RDMA database). + +NOTE: If you want to not only transfer KV caches, but adjust the model execution flow of vLLM as well (for example, allow vLLM to receive KV caches on some tokens and do prefill on the remaining tokens), you can bypass both KV pipe layer and KV lookup buffer layer, and directly implement on KV connector layer. Bear in mind that as vLLM's model input is constantly changing, this implementation will likely be broken when vLLM has new updates. + +## Disaggregated prefilling + +The example usage is in [this file](../../../examples/disaggregated_prefill.sh). 
+
+Here is the diagram of how we run disaggregated prefilling.
+
+![Disaggregated prefill workflow](./disagg_prefill_workflow.jpg)
+
diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg b/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a25ec5ef52491a0e3faf596669e6cf0e7c7ae175
GIT binary patch
literal 142656
[base85-encoded JPEG data for the disaggregated prefill workflow diagram (142656 bytes) omitted]
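
To make the lookup-buffer semantics described in the README concrete, below is a minimal two-process sketch condensed from tests/kv_transfer/test_lookup_buffer.py in this patch: rank 0 plays the producer and calls `insert`, rank 1 plays the consumer and calls `drop_select`. It assumes the same launch convention as test_lookup_buffer.sh (two copies of the script started with RANK=0 and RANK=1) and at least two GPUs; the class names, constructor arguments, and ports are taken directly from the test.

```python
# Minimal producer/consumer sketch of the KV lookup buffer API (insert / drop_select),
# condensed from tests/kv_transfer/test_lookup_buffer.py.
# Launch two copies with RANK=0 and RANK=1 (see test_lookup_buffer.sh); needs 2 GPUs.
import os

import torch

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe

rank = int(os.environ["RANK"])
torch.distributed.init_process_group(backend="gloo",
                                     init_method="tcp://localhost:12398",
                                     world_size=2,
                                     rank=rank)

config = KVTransferConfig(kv_connector="PyNcclConnector",
                          kv_buffer_device="cuda",
                          kv_buffer_size=1e9,
                          kv_rank=rank,
                          kv_role="kv_both",
                          kv_parallel_size=2,
                          kv_ip="127.0.0.1",
                          kv_port=12345)

# Two pipes, one per device, wired exactly as in test_lookup_buffer.py
# (CPU pipe first, CUDA data pipe second).
data_pipe = PyNcclPipe(local_rank=rank, config=config, device="cuda", port_offset=0)
cpu_pipe = PyNcclPipe(local_rank=rank, config=config, device="cpu", port_offset=1)
buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000)

tokens = torch.tensor([1, 2, 3], device=data_pipe.device)
roi = tokens > 0  # region of interest: which token positions carry KV

if rank == 0:
    # Producer: store (key, value, hidden) under the (tokens, roi) key.
    buffer.insert(tokens, roi,
                  2.0 * torch.ones(5, 6, device=data_pipe.device),
                  3.0 * torch.ones(5, 6, device=data_pipe.device),
                  torch.tensor([1], device=data_pipe.device))

torch.distributed.barrier()

if rank == 1:
    # Consumer: blocking lookup-and-remove keyed by (tokens, roi).
    tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi)
    assert torch.allclose(key, 2.0 * torch.ones(5, 6, device=data_pipe.device))

torch.distributed.barrier()
buffer.close()
data_pipe.close()
cpu_pipe.close()
```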
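
The patch also gives KVTransferConfig a `from_cli` classmethod that parses a JSON string via pydantic's `model_validate_json`, although the tests above build the config programmatically. A small sketch of that path, assuming an illustrative JSON payload; the CLI flag that would supply this string is not shown in this patch:

```python
from vllm.config import KVTransferConfig

# Illustrative JSON payload only; the patch defines from_cli() but does not
# show which command-line flag feeds it this string.
cfg = KVTransferConfig.from_cli(
    '{"kv_connector": "PyNcclConnector", "kv_role": "kv_producer", '
    '"kv_rank": 0, "kv_parallel_size": 2, '
    '"kv_ip": "127.0.0.1", "kv_port": 14579}')

# The helper properties added in the patch classify this instance.
assert cfg.is_kv_transfer_instance
assert cfg.is_kv_producer and not cfg.is_kv_consumer
assert cfg.need_kv_parallel_group  # kv_parallel_size > 1 for PyNcclConnector
```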
zOk%?3&btfOTvV23*MoYTsO!DDF&zXef8skDwL%x9ni9WJWht?zS~HboSq42~U%Ql4 z&bMG@#FS@6KZ7q7DG}P6AO{il#Mg9Tl4VjN+38!eEVX1@XWZC&F-Bc$Hr71;Y=1&= zf0AZ~83qux6h`*q1k?EL4vr4c*W<3ggE8fGEoN0d)kT8U9e9FZ%-j z{nFh3kD5mR=-8=0UT_EfQ6=AC{!CWHsr08!|1a6z-_2Qn|K2|%3KX{Voc?dv7dQ?z zlRLuphTf@?9QR?Q(7n*^diaoao9U`dC2Lgd>rEkvFW9j;ve6)tld4@!ksv9xV-Bea z720#w%>{2tvpJ-9X^y6~cZ=wsdR_%$>*&^=VZGzL)BgK~#I>FbyPY?7>r)>mEOj1q zDb(~HsJV2Mm;DJkzOG%K;oAwo6rsIY>1nAP)tFKxuN&PlB{4HINB4Pbh#{_GLZ>&* zQLh4lruu7rE>N<#6-wVU8~4y6}BJZGgPp1EF$0o4KkPF(o|m`84Rv)ahZ_sX3mbw)5f`;tFnzR;0F| zCX(ALRxl+xG_7X!deR-P2k1FTTLPl@DBHDCGff1COw7f`p(U@+95I-sMH-IE`HJjO z5qu9GQhtUfL)*)38ChU0N>|uW_>ZW0ouw1T3A>LQ!;&3|dU+)1H+L;xC zabUC zEJ)=8r(MQ(w(Dh({nPvD2abJ{mpDXMB1t-QcsosWm4z2mrNosDX!E^yEx%aC9KlT3 zD4gg6LeNIkPxl`?4*}Pj4|(#|+s153ARk|}bbDO#;>XXLNyAck4>+GDL@g$>MKwRNmF<%-LkeVh6HFQZ|{%N zBkUtg)~1GBF*vqzW4VzNBcV#?O{VhX)00I4ma zK5p-n_c?pU>$-{^7W$ca4jqzP=u+){f@{WGhqOen_8Z`_5ABUfSzF|GfBG59t2V{` zPVc?}v=4Z>liPU{Q)Le8@OS&(`wJ|L7?LkoP0wS)9O>u0RWXgao7 zGB)qa6I#adZv9T!6CPPLS5Ap4=0w+H7uqByQP$Jpqp-tN#)85RcA z2g#uI)Lopf=8KqW;4&~{=1`pS68p<09No}#fy(=)lLopu1ZYu}~x za=lyOEP||`XOqHv7M~8lu8?2b3i|fFH{xUR`dZIJ%Pw0J22V&gA%%%rVRQY`WlmC( z88$EJy*x?pGTvLUFvn_phziPzFH2mSz4f7LmkZfjvY^5f37C^nu&us>Tq9Tv=ftx; z+fe~8t7v`A9^F-90k>9^2Jag0-p{ztcl!JmSAdSS{}t}-C6dF;92#{W_|COj0P#EB zkxd~F+lt8B71dA6N`o8Eee6cGFfli?&c+4}T9L;oWd)NI+Zj}pnnLc>1I|Sv;Oop( zUU~mx_tPu66cJ?GJ>{91odI%!-n^vWm$JY)yY?b+iEm5f2qHKJW-Wj@NYxpvKj{F# z-(DH+9?2KZk#nlF61c*3%x145$A`1iUgBt%`bQQw)m^lThwcj8s|i#d^4)vA0a@mn zC})>?=H?qY>tIpv1#Nf|@;FJ1DhebUVlndAXviU8>J^A|AnUVr-usYnn-ng(o_SQ5 z$0echIa}+C1QvNcL^)FtGU4Vs*_rsG!Y{c=-$5(tBnLAJH7w!VP>2NP{3;j+VT(w9{smXujC0A>k6Iq2`Do+*1fklCk@e^;S(sQ;>SfBChK znunPfWw3_o`uGjmJS%@$zkC8Z622ZM4HV-xXJ8w*hTVYEGisYhcDcieY(AMbV{AL_hlxa_qJ#n@;QMyL`F2$pBmgOn=%%~4dQ$a52|M*p;sDV zVN`blCX62!D&J+c_RenHGG^s-LxTi0zm}ntd%L(Qyi@I7{`eQB{s zR!ndTGui-Hs%IFEj<%@y1viup3CMh0iJowT%gk(7szV8VfWx)1RNA8DIZ9|ILfwLE zu~+sx$YJsN!tHe4$ge(nT!N2;zgF9C8<2;I(&V}DdTuy`gA%FFgFNwWfZmm}D17bE z{p9+=!z-%JdzbgiKICHc-9YgwSUAjt`XeM}PzSs*Wu*Z}`2q%){UQ+>2gS0m{>!U7j+|V%o31cq& zm^3@{A)YjW?*W=vWjqc#wULE#22WxYG2XA6RX94vKXwQi#DQUp{`&F zB{1StQQ%J*4fr%I8_Kx|;iYr;W)gzOVh=(;zv(Ha4^c7HVKqJ}eUKv|XMJgpt$&XJN#J<&TXroGtr3`;f;U?;b z*Lv(;e1PHr`}5$VQ^ z7Cp9RNm7hYr>eliko$%<#T9fF9dvddxJX)>^WA-}8`xDT!zvfxulpF&T(jb0KpgV2 zJVvY)J~WgW+hzCAqTRjtLQ{fwTTcta)2vY3-NF8g=raw7oLy4{8|iBqNwpEhUC%#p zDUX6~CLOdZ-Oe(!6EWd?noZzBPhaD8W?_{XsFdbF$WXQc=~`G58sd#@OmdFSc~IT@ z8k;i)zw~v?G^<;-Q?llB*I}-wkyRlB2ZH>O3+Y*dy66|%?gXPo6dxssJV+EnOUlDb zz4fmJiq;H4NB3-B-Y=_G?s{yut6$%A73QGtFH7&k90XV+%V$Qq%m#}z({i+rjiXOz zhZ_Sk}Jn>%Th2YQ%f6a=%A9+zm|k#0uwQY@RBOoa-HJM$=EJzOCu z$wP%@Csfh!_`Um&#_g3*r5}*Ocft2lP%~cxvX}#Vb+sLx5X(>V_!}b37{&beAGQ7l zM;XN;5VRyU_1#NQ|8a8On}v2wAJL#)8G^+Z@*L_VP<(j!sNKNiJ*IjkUuY9>OnXUe z2k?$7+5~mRrYR_KBp$#Q9eA*n{dM<7NwxO9+NXPJ`jtwGPg@^23!3ucX!XltC99Zo zn9#CMCF~nf&=$SkcphX+Y=y^m$H@s7xLcacx{O8QQ7tZqniq~5^v>mEbPx4>u5Z@%HnzMz&mkKx(5l}&Zas$+ENHMB_eS|hvt+M zyN2)Dv%CX8tRI)W(fM)y!(zvKSa@LI`3~jAEi9Pc3ENjIQM9o0;n|FzBH#%*r}N6; zMUZpEW#!v~n@mPUgRXA3?-rA$qhMD09QM=idIPBthc_3bqp1>d_D8x0I2_kiOVm`u z<{3c+=lO1=usKkbhU(=PcgW!x-ib4|Pd4&T-M^6UV8rIkk;W7vbDC%Jvm1!Xnl+w9 zVbh7LE~&21d>c%Tp_q{lPmoiHVy)79DGv0*uhbQ}0tqAH{T62}1dhG*78h@1nu&SH zEY_y)4^<`VuP}I!tnd|pss?{^Uen!SHCP_+Q1P)#D%-(^LB(cASDCDs_;<(=>PcP) zQv`_8bk1HLZEV-61mlD%U_xfq&=f_MKgFihvjS35jhlsO?07)@bo|X!; zrkSM7&Fy|1C_2ysdP zX^CtSsxd6Nu_PTPE^qoI3Bkj-ye8^A``$#EAWx6{kq2l>nVVH^#s*U zX96}ir#0S|l@-_TUcE9Jl@_~o$~}rXDsZo|v#(Lr4gEy3$ur&+E7~)V!<4RODDSY^ ziMztZg%m4`@{x>z%$Ij}@3rZDW^VyKbnl!Kt`)?FTe1N+0yeZ-5ep8@B(g+P6^gX- 
z?T(lhel{rhDk-SinjG*!OC&nzXb2XW{wlf!(KtJgl%Nk-LQ08>ImA`{Q2rCcEz;N} zp7E@*k1}mt$3mvgb=pk6uSA{)MeWcOY#oH6GmrG|IUb@A&hUBVI*7)sqj5lS2nf6R ze`&p7lTj{aN6=)yS1skAvor8Q4!WZ&qL#TN^GwGEa*JCB>cGzFS_AUXdhc=$NvWr8 zdf7(?vv%K;URkgx(HNteu~a8=6Ky=;hy~yz1|EC#FstuEfb`_qFK7*EyyhC^Rz3Ul z77#BAwroc`u8m|)uge0ssL>jf3Q@25-wDC-Gm_)Kvc2tG2jub+z|3i|JrZxTG)*Gl zHiu!!3Y9OjevKCSPwwOI8teh7bYO~>tJrc5BH^8+l5Ww$ad9E9)36DV@hhtgty(z< zZd(2dwFkV{@_X9tg+2?;5!u@q* zAAppdea^5c#l2pKX@-oCk7(LbdS@bAMHp;}B_MUWk>D_vVgz6i&@^szD1cDX0aDub zT*ie(iLa0!pLkP<3j(-t2hE~aNC)v*lU7GX$GJNU=8+~0G@w@A)k~qi`wsHK#rsSL z0UBrrDb0kf2s$5kT|f0co>)?j$!q>1Fmd}}WWIZxXrv^A$Wz~l^_pZBa^>!=^0gz> z(^8cyZQ*_k(_5SCm=!;~e$38jD>j&XgA&xpAKKR8f}pY0a_3l>+21Ztg(&LeS;uFX zU3eCI{Ha^RL-RvmPl{Ge6RNCzg&QyaQN&`}j()NXy-S+BP_7BlywbR34&LaYh8O@%whDLy0WnhzAj?|V)QQ!<0EU9eQ#l-$?I zWPRHyryUNVUQ)>}B#t(Wdsg7pp*~WC`r$G;C&gS++&jOQ+`LYc?0g+;0ZXxJ^OBF( z^K3wSwC9;mAYylKBFu-7M*+lf$}uASX)|RKyxnQho9Lhlu<59;IfwX`7TLCifNY+##ehRh98LHLDdN4Zf z5l_!>u=UBia_?P5(siucET8evdW*HKkeoIFe0TxMIY^%_wD(NXP1r=Q@RUeu*TrzV zwdbibBff)nZ!9JxiM64}wr=1mb#w%D;Wtx!KA$t5H9lrzwG290j6Ccxf+`2Ms$v8` zP3BW18offMFB6xX_d2BeM=k}caRz#_e{7^59qh$QUoYMmNhIisnb!_UZqTldkk|A* zV%GPaU?(p(#2-HOMH>{GQkk=ua&%xr+40_xhkzNMOOBbtmDQ7m4&nNj&+D0>Q?bl3 z-3WPFS;~W1<44#q--Aa^y%xIIS442_dqEZ??zd5Uo%E{ppi&ELLZB(>y5u9=p8549 zgCh1p5BF-`s#(ZUt4T!Z_6x{`eiO^n!_uYtY{1g~WoB2=;*pUxNq%eD0{urb)|v&= zr$kB5TD{?6^-zQk#r7n+!227X{4B>?>v~e)`Y-}0hPPR6r12>Ws- z<(v>|@#6%%AKLU|cu)tax-%=Bdc+~RSUpKH@sOF>cz1DC+_|mXE<~FCi5Hmo3u>cE zsR&QfNHi(3dFmh~bf!o3QaS0eyJD>a~zWNX~- zK5=Go|MGr?E(*ByJBYhX*ImbjgQ9eyocJ||?}eqEX|t|E#EY)fcoPtUSQV9G)@PId z(y~canw@&Nk9v#H%Smz9tMVyxVJqFAcey{_IO_chFRQm>*hJ2?+vRmwJ33FlOPQbP zsw$go+sTx#i;l|xPe{_0CNYP8qC&>tQR9r{mm>-enZ<*W)&pnGIE#MxIBV=2u7BzeJC2d@ou9Lg4XGdmiiqGP{##M=%wFeDpX;O3uy$i`li3S999UiNO z70gCnD!$95*x#|cr#%?N48YpiCvU;|=zWmGNI|oBbc}{BVju{s zJgO-P(>%@MePr5rs1{1j+10;3gzT{{?s5`rd4EwZr2CcO>DRMj+RA!? z!&Ey$;p0ZAEOF5r-Zn1jYTo1Gq+D5ERq-r1NvvU3?T`=6A%+N9IWg-JIcge=4Wxpt zBpy_^j3bM^-q^FT)w*VOzd3enf85Q;+9*)XbK4|ypoA%;8+SQGF;3G^gZbr)8IVex zLz~(KQcf%MC?%lKTMM5UQlNIh9lxGeusGiH&2-GHyVOds0q~%ELZxd|H~~7;r|2WqZ3hygK0!D>H1!mvlMh z@T=>BcRrW98zB3pj#iM2mdl9WjLQb^^sivcoc0o=pdT5yt4m3-{Qe*i`(Dt3$+vp5 z;4)|!NvXvN+{C{Xvf-phxPql#dN@+;E2{A%?ljNQ*e&FBeT6}zURbFXHDP*em)^Vu z+qts2q3|Gi0$@UM-{JmNv9gk@M3pO>3r%G%19RK&I0xAAZ^mgm-xxc;_JR;sAC65a zTT!o9K~CVnh!ZSpfKK#>Pqh{W)$2ItXP=A1QeF+wiQw~_!FFzg_cbgqcJtMazGu)@ zIW6wonH&>oWQ_x`_}j1Onr5h-Etfgd;a~MHYcCc2UuEjIFp7V%%2X?bWyUB7Aa_o* z(xv-1w!z_uTOQ)(Wj)+<4*D9JM>nu!HwSI#TB z=C&vme+T(0^RS5E)cM@Weu&{+BjgAE-e?7NF+yUq_|ZYA6valiobJ-aUSjvUF1r0% z(1#J&qm&F~f&RSZFUTiX)jC#lu@V%XpGr*vjxJJ1#!>8)E^Vk@1>WL{W9{Hq!Lz5& zPraO-~8y@l{M9TGH}xNB5NtcgrEXmMRF9)7mM z8#Bo^&sywcD`}=`u@}s{lU^&<0wAswx&TdK{-T;9@6*%mbdKtQyCs8O=HqJvlZS2f zPkNn4v@t~Hi7{w7?oVmHjAKMdWOUONM+Vz{;nB4g1fx&6i~JI8#b)W*9qHlRAK#>& z>pHCc6D-7|pDKl4+k=;2+=!s*-#{;*MJN`p1RE0$%$_!n=ZsSbY1!4(XRh8#Cs$6h9fwMu7FVQdxZVy(rSuo9$QtrkYS;@^V;Yl zTs?dGT%0xo$aRRQ3h~_$1J^ih8pg1ewxzVP28y;5N;^Q`?2YhNjsyR;SH}tAroJZ#(avH>sx6gD=4+~oCJ^I2%5p9 zXBP&$eVICig9u#*UE)rZ96K`gne{FrTZ3s+8Gc)AL83>v`Fd?hP4&Bqs_J((kL|L< zO=fI(?>#|3f6AhN)iW#GWJyuZ#QwVuUVw(b){2j_6O|c{ z4jaE~l0~#Bu`3L>`e51rLhCG(dq|8P27i|r~XP8iTK_W{k4r?lh%0KQx|O^>{~b9_l-a1C3q z<}B$nS6d?KR3sWE*JOM*;;?L$pzCs7QZ5)TN&|KSMkHhgFCyDE=&i4kA14?GH2%zJ11z9 zD6!4l4UXiWej*QJSag}Kx{Gcaz2JytZn`qRbZnyZ3A6v^-L3PYPZ>nMequ2$rl^vF z@FM}lUV`M560toD%N9KPWNygd2ESjfrE}Wz!=y}aF}v+67x(UUSXxMnA@Chbl272{ z%5Og&a@%0&koWPowJ_pV6P>FzVNHX$nwVvw)7)!oE2yU_6Ww6T_2tN?2FkjApc24y z17M~;pPXN#6zkXOczwRbB<-60dQE>1A_ zMj|Dyjl1d#pvTn8*Ah4vs(cO3nHw=S1d&BQjRyqlB7{JP+E7q>8{k! 
zyAPgw?m=W7nTjR%D$eGCHsNJjO@3wsSi>0N#LSA~SkcJ(z!k}nD=X<&G7xW>!)^qO z*Ikunq}cbi$Srv$?R~j7UnD`Pf2^%UFwVV_b?#ZOrJY;NM2D9IaFbJ~IEg%=kIX_o zx#lbo&8}$ocE`R+aqT@`??Y4qVw*RF+(h~0LJ)H9z3g{>LEL_8eV}P8O~e@M~;?9a4j~Z8e#ChJr)WhSU8U}&2%M`T>bA_q3|rNH`u&AO$5P4Af}xa&&qo$GsiikBpm|S=W{YxKsC5G3L%ECgEZm zw3FD3O~&jM2Nf@|z$C88rX}XKDYyD=1yaRl{G=OToMpW%-svb_-8_??BUx|sa}PER zv^u`euVp>V6_DaX4 z%lWgaQ(+E~I;ay>@Zbbm(vR8Ci#;IQY#ukpVMi`rurvu>mN*=SD0yAfZN(yn_H}9x z*?3jQI$yGo@XT(n4EUoZ_t|A39vhi}1+9|_>jzU9b~WwGuTxEE7zOb&c(k6%a`%~+ z4Uu5`M)p;VjEHa*_eQYQG_VpKme&E&0_eF{?$+x2r-3n za&mI~-5!*WyCc+_T_z4>o&s8cKe}PsRFFrNOyrj8eF-Sd_CeU_QkMiMI&Ia^VtN02 z1FBg%v75mpF}kYdla8+8D?1TAl;qtbR1Qj`hr(PqUGPB#e^g6pBH2|yHPMQyFM5Kn zceiEq^#x#Z7^S#pC0C0@Aw|92!1#)R+FSBBuy2|5zZJDcu13{iUM_AJC8C;Z5i)?2 zsTpC5egs%wkhp#r;f9-A1dY|t<@Q?w-Ewvkv7N^45;;vV22%Ex8$XV0wgL?ARo`gt zFVn;|2rH`pHU(%I1V`bhfJVme2mTd4TGoXkh6{GW$C*GS5thcYH@l~aFWx@{bv-_MMOHT4r5-B`Tb1Y|Ilz*VGk}##C zqp_%lE;Nfr@0_0I!4ZoS0a=HPrO3CBl_od&CAx6!63Q*7w8#U|JYT=1h4+(W_m`O}-kdCP_wB9vB z5mDqbmB8}g!yrMcZTgUO3mP&(-SK-G+10-RbvrV`9@UJVI@inBhOeY)+1k3~j5j+t zFnNyjT;IhMCcyB{eTUCd+PUazy3^fEzD9^{3`1cN~CTnm_gSk2FH`4^%_O zO@lw6jE}}6JEzwjV0!^A=WG4AjS&7#(|?*k@N1$$S`k1U6kelOC1C)x?o+=Cnj8G{ z=|AF@>!(&H0Pno?k3!@Bq1XK@D?j~(A(Q_1=>u7Xs1ANW7NhR-cTk_M#T(O6Y5*1x z^O@48G4-+Zz=+pewq>S6a9fAJg9ej_{j}g`P>nLU&+SOyU6VTBL3&=dHiiBnyURc2 zOq~7Kcij1}`W^f%k^7hJFP(0?U$?*q&BFgzcq{yku+6OcA6;ehH@|3){ki1>cNapB zl8o;Ss%IXqg%f%-cfJpOS|V}#)Hn^qHDR1@m#5lCON}~!cIh|@pt#PDK#!tkE<^BI zR7DI>>)TO4he0SDEEL1X_NeaA_q!1D8*mYT0BT1Cf=Pxk1t@w196|*{e*gCJ9BPL$ z+-fOeTa_F_sUk@LGNI9rXW5W8B(DYw8{U;Ocgu>SUDPC$&!>2PzCL(Cr!AV|kOmZc1}T{DAkAVDBcOYC z6li&?m%+YX_)lCLiO(pC6D%B1rE`EBf)S5{i5YYOeFBQc-`)bI?hB%xA`g8By=0MC z_H)%hL<7={`6LA&moZxcdw=JIyrk>0OeYz8pN7FbSUW)PPk{_cKjPhFlQtZ1hPLO= ztMn9C;0A5mJazbb%>~bj3UV|u9f+Eh4xYU2;64mQ`r_aGYJSFS{>e(|8k;>pH-bb= z|6V%{NGpG@uLeTC`EeFo$NpXq`p<9mpO5A5Y`Oke=J~=OO?ToOJU3ha8?+OCBir)t zC)2?49zwi+%dCfUW-9sIH%Dot$b{{7?4l^ydA^96IyI3x8BnJ%dg{T<^+xHeaM?GP!0#X@1`=m8(IJJOqBGlLdnNg$ zRc(1$#i>?3Ue<0E{sH+f@{_k)s!px?iD>vZ=aXDo85@y;qw+2xDl40-Id4uxPocde z6-QMbiN1As3AJI?>~Dazs2Js?>~B|)jf|Pql{uCEoUMgnB=g=4^J#zN@dqeQc@v=2 zH(B;6N-6OOn~0{tR$qtJQ_$nRH|3_e2Mp|5$0Gdz*l^}qY%-m1ycwXcHeN!Wpd~2; ztxT~DtQH}P(JD9GMej}w2c9nS17Xu8=6V4P(fZRh;qRa^*dS`-o&5%Y2bx91_5xxe?3$Ge+^bM1@iSj6A-=5R0>Ei{tVSOsQtxH z^H-8~&wq0&zq%X@>xnA>j2*ad7;(#)^T3QMVAQwyt^E%MXKY}CddtvmATY1%rS*RY z6~J?`L4cRKbd;qO5wkptUy;+MXb=E@ol8=U7B`y~M^0Ivym;k?NXTrqE8JW2;iCSv z(Lj1Losq=StUuP?JOv>^6BW?~6Fm_qAkJSGMu-XC0S^17;nX7X;$nW0nl-p_EtcEaW(f z!wlwtcWmiFFvo&Q+#>)Gl6rgSMxETPin{u1T!Gq5NYe`C z?YFLN=L+pAvv{%xzJvV5EmwR6A?>+T1ENtA^k6><4<$XZ++REF3~#kmnINxRZ95rO zdX?{b=?%WOvO!;19?XJvc$(pi8gCG=Q34Uy)_cgi=F+g0#^9#s&{15?ppct+R)G-Ac7Rx*l*Ys>dvZamsk|0S-Jz>J7CHAH8mIn#6NG_?_Z%;5Ery&1L_w59eBa zV9RsFS(9?gr&*Nnr5Q7H@D%$E}A#JB4Rc~4qgGAaKodwOWgxt2M36k0V0G|3< z-;Pa0+3-?riI%%JDYk{ciHHrpg4frUlJ4W4kNcD*X~vp#I5k!XI(`23Eb3l;dO@-7 z`}{<$;z#uYz#MO$od$;@VBbN{L%p}?o%C0L=IJw|K96s6K#_U-iWeHSj9I|#m?-$3 z1OGavUHQM5#`R}E?0;zde{oE^?TIBwi8aro-`FSk+ABAeeQCPPyTjIEn?}u#7>3zF zo>YrQ3#XATHJ0DZm9)aZOSKwAAiiKzDKqjGEoI~KCwCIOd0Ck-Zd9pe zRchZ}=Qau_8t9L$I0K9~O52=yrHI6l#7v9s58h1yUjns7!G$0{T@6GDgvuq+IG!o1 z7(m=2goX>bU0j#~SKn#yP!oO)=Yxr zb3GtGIiK$!NCJFKX+1xFJ$`hBkF<7U=(q)pY%}E4RBq)2_(5I|5%6KLq4Jj~wP45F zjYEzM(%C5@m+l#%$T<;~Gcz&XF(=w&RZ~Lzp_%FR=26Ho{tT5;9LvV3vGc4>n*xc~ z=0rJ8F~z7v@n_@qqa2(}1>p!E!1j2zdY;Z1f!sg$vhg!RyU>Tn{Ggrsh;69Ln9giM zmD{@|T$ydSsp8Fd8W|MdUj2@95DCqP>?{^lOmj7f z3mmf$_pUqVI<|l_b1>csjVfh3@zSNNN3kQ1A-HwYbtc9C^Jq6v$7T9J(en8s+zcpnAZKVV1YBM~S|_aaY;ST`i9|axnXD5H0Nu+yeVWa+zi36yuPG z%eWlgGL*m5j|6FYD6H!vS3 
zW^RY`pTIUj_6pTpt!s;9THPloG$0{Jhi*;)jn{otHaz+E0F@6a3A0HacCLa8kI}6;VzC~Ff$!SKOU84 z{Y;M--DlJ=&l;dgT8M>TYt1x1FC!FRYejqU9Cfw)4c8r3usvw|KH#11U>Jg029He; zK4iHZuLbm3e)Ws?BvUh4xaazwQK%oYj zEU~zFiV^UwtT88Pg+n3qUSD(+RgNd9R!6BNT6!f(QY8?cRb}$o@c1JJGdE@=OBc?r zlMkD}G>4w}Mpvh^pvJU;DSAT@o3_-@jXan9(GLJnbAL|0pmBy>E=K$(VcRboxNsz7 z1bWN?xxcK|mLweQ#P;b+{^bVIo580Vt0A@GnNv;GT0pk&E8)go4EQ_f%dsi-@1S?6 zY=Bs24LFM-wuwNA>88-^^sd;B3q3*CGKP^dEs}z3@G}4s?g3%%U!@%X5t;r+)qJd% zrlk6)XNt>93Kn**c5bn44qU2PsEq2{TgKgiYMLR6(b!WUK<1qZq27WJE7kUY2VKem zvTULq3xJ3{FXZ=c`7(!j=-EERO>2I|iX8hksJZbcEg5*xfa`gIpEJy#x7Ytj{Ed5W z&5wphIeiv7z`Op-o}s^~2K-YsGy{_ng~@JiKkbH8mK!EK$TNMVo1Z)i?MAvr+#^|A_Xx}yoKszTopLr_1-H7c0JmB3I5O*>Gii^ z#2e_)(Q8W?6jg#&nCEL-U1BChgi+KdSjC#%XO5I>MxH0J=e?z%$c~hnT#?vt>C&)5 z9sxJKHHG$UejTcHTs6 zThreIB=M6U3U{wl%9{5GJ*qoC(EX^PeM%&zc5^P`F|rBkCsLkI8YN6+ResJR?Kv+C z0?`*wECAG%#UrsIyWRPl=iYcFeSMX4@qW#al+!(4=C3brXzHy8_>$PPhGekIM>-e9;z0b-&O32}*lP(NP=%hp`VG^eDhDO(>KTZwk>X4W z^=Rklr*7;QZG+AW^W~!&^>Mc9O5KMjQ%eT14DUbAd#%0pWW9U9ecAPv8U29Hj6|zJ zDusm*8gbfh0O#5p*{|{i&I7F5_1rAm@0tt7w-Gp_)B#{u~u3S~avL7$-C2 z)2OfaqFrIq4pt0nu;31;plINEc5$T5EDhJeoSlYg8gg^x(RCJ%Gz2@v74JQwE`SHx z&!u6*LBYlSPN@t5qJpQ{-L$C=@5;+U-ZO_x@y-MzmtAvj3C{VSHvNlw{GMO*M{oVA zqW-6HoZ#oUAFEd>uph@5f2nB2c|iE6nj~Dy1Z=|n0??l4%UR@6Oc5S3(kpYaeIK*a zpN*y+zi6~?sg&&n%=?>V4@q@)HEl+ka*?WFg%2vCUMF1^7Znz$sM;go^d_62ZQIVF z>E%zEbFs%mPdFq9gK0JO!uGzOCVeemdEtdL-wSkQyyJOsEyloSMyPiEG1@er#MzPr zZ^lg=1e|;}Yd$_}03foZ<5ES{1Dv)lZr`*deI6bNI3BgooUZwXj?h>Aar0-?s1DvdCR%M_VBgfFcWDoMF1&Do2u+k;}aTgjJt zA?!E4E#q@z&xIzRS0;5AXX^})2`JJg+TnGa;;LIE(3g}8r1$y}NTE^ceMK~oaIX1F zC20qiglf9yxpdEMR>U3(yl(@{BW(9Qj59>{&FIO?fC7g!{&>^ zw~O6va`t{zbqe;Jl`bv6n%Aby*W5?dz#?pD@kmjGbWNbDbOe%bU{(I$r!x}=$lZ-6EpEO8hN)GrES!cH)>n<>lgrfC2)LCC*#P$d=%Anv*_qyNkC2D+!Q1Ve zM@~=qXdkl^c`(~%5`Mi#Pi2$vwh$1*(HHb~CwM)P2ZzB}QFxAiH4{}6yXq*F0xvAv z$iNaSLB_v@)wRv-!cjq27@wM02IZhOEgk{G?^a2ds>50p9~xU+#V0+`C>{90>R*Qk^659)(f#v~@9SdN!UpJnA{{MfnS$^^1{+~){wx~cP7cCowm!cGt z46X*XtE^^W7y137%Zpe?s_(~_H8%^MZPM#QUTiIX7CZQw23!!2`uKI1<%8crqU@+$ zzOr80IbBOnm7CELtrG_65qhS{fO7-70lOng2l{4>=5U~8_S?snJsJQxuoUx6t|s17 z&6dvFK4U~Xi~{o9nIdVR)AQTMSet?#gI3d%p%35Mn|{FL=yiUTH1cw7Y1I~a$abgXp6MR(|oXTrLZ z^K3j(CHYOuY?4bb!Ya*sR@YCfxwTKfhQi(Bt*h!dyd48M=1s^8dr$ zdq*|drt70YL<9t+cY=bV(nNX<7Mh5NNRbv5kR~9~YXAk577+!d1u243A~iG{b_V1j1_HXvuf3V0}#w70(-sidR>%Pi$saK0(cO864 z-5Mbg4OhxbGJ^5qs_&PRfObU87()95-0I(T?UBWrO}TNuA3DD==KfE@NB?wB{L{Zb z2-bXh=W>1zo1N!cs2dg9T{nEHspGm8+;lH))~^?~nuB_0E+ z%#K~EV)&^DE#5eSu?=bS00B@cUC5QVF6Ej5l1++tk3wgvkobf*wvGisKD)8HTq&(= zuUtDc@6hHisD!UMZ>TEIcH|8G@&~DTE7(k~{js6bn^khL2<}FGIet5p7C!b7%|{n| z=(HfEYA_EIzU8ZkF-Rq}UE^I?PFRCUQrjVej?iI#{*5FJpf)ecMcY8CEZ68i+*yKE zo;ql}uB{J5lsT|5Niw`LI?vw`2-qW+!`x-uAlq_J!jF9Ck7*tXHIX3C(Is=+ztMuV z{0`4pZvqnSFf;KrFq5K$uh-sPnEzt%F_C4a6X(`YvP2wOYEedpl9}6sIysL{4yT3 zid>}I%65++2CRJ}uo$CU-jPdj!>KA3XBnDb6MKc|*6fbNQ~0Sz3AE{o&=UloSBoRT znko`LnLo@OGI!Vd>uunVrw$Pg5*&;O`mu`N%Xq@O^2ggFY$RBVR8PN#pU9vw8xvT3 zO4H)Ys$(ukDSkS+KC|5uZnb|5{`Hmza04ABn9mlwOdTP=LcSFXO4a75AKwjdmd>kz zsPFqdSi@63>S3-NDllhh;ujr^39U!epfMlhjfF@7%fj zIc-b`AfZMe_!SZ$NQiBvuTz8_)m0U~1G>MEUH4fmTeVOhh@^KLL}mGqN{Hqi_y*c3 zL7p=#3hIQrNcp_^?|rL;3cq;kp4C9B49qjbW&G9!#ne+k)OoD zM(?M1*zaj5hNrib+@}QtpAugUX_^OL?0_v~yq(M3*>_0aPX}szbT`mOGha%sRrCLd1$JoWMMF(e+pw>$=z9Pz z9dT`2Dlolx8)4W7RfV-=ACy2AE6I|;*9fo@XMsC2T8m-@>vJZU&^W*S1htPK4iGk< zr^lOi<@f^GTf-scNxf}Ywk#x8k9O~J(YVBN=`ub2TXr!ApIS%u5L$iGM6LCgtQ%cp(o>p5)l}h<;ndpiDECbz) z-8wWTA^5ZiHBI3=`HS)l!XmP$UCeZ3sY*yMa6S1Q@V$x-sykQk#GzisPZ0_mzoEHOMj$$>3Fv8rA9EP^USjKl+L}^143lZtBHiiTbYM$g`2li`N$}ou^j0R(WDK`9LUDbvWPuDewDoE)g$bG-%?go^q^p 
zDR&x_SLrr;0z5veT4*Nb1I{Y+)vOA72d}9@LB(AR=wvTktz*Lryy(rA6Wyt#4j-s` zAa;F(Qmhz zuvXe3PgBDhUxc8xOsGL0%pDF`!$VrvE?WQ&Rf7^A2WE*P)m7G5emv{L%Od6c&!lRg zuXn8X3tQ4t3rhmMOwhx=7KHU;&5P^4h4o2uShUG!C-{8$1X6gU7c5XM<;6zBg6Vhc zZaRJ(@bt}sniw_2jPYRwc!pR+=}uf%n5KWvvt12Dj=go~`Ib+#v$?9GugE4M92_Ie zv0YMZ^Mcn7VqvR&*3B50be#BT`E8nDDz~Qy18DanJ#T)33WePE_0XOGgX0;jE05es zL;C`~+PK9aEFOI83XM0+44~|N<^nnxB#>OO&0+cBugiL?%0|?nOdF}P(}#sK*T7c& z1=C^2;N$v(g8N%vPXCAXzF(7#K?BHf?|wn~UC}I0t2GwR7d8nGe`B0|2xCfdXi;KVCGof9|cvInEpwELw_n^GlGxVqKZ17x~O?6T@aZDxu6zW{&P4D|gy6eP$xeXz>I;a|HwfR5U*$rle>KI%FpE7l~bpO8*jTi-dX@(2)d?D znqcBNhTwucuyYfDRnm@;xmq7(zV4}EOGuhTyFtvv`}I}qqG(K7557B+tmxvMnjI9(j-OaAmB zmCg>@Cf`=tB@w>PAjTQyv!Mev={k3*F5s4EiT5o}!0^W^?^92qvu9Dkn}RHEAVncF z`_)z$VB=kkvrWwvaWEyKp4L&YhZihn87Nix>TbIz0cEUE9iyrzR!3ZXvf%@6hXd=8}c6tKS?$-zU5Kkxl`;?lj+Rplk`w3aTwSM zBh-a6WW!|xZ(e?mSqzS|P0|Z)uXUIvtXL>wMjK`<;@V4S+=150Vo~1;#WOkCEf~LB zx)z3VkOT71$)rVUD!Cm${!Zrf#Bp|e{~L_YreV$3AkOXQ6rCoOQ}^yWl7A z*{gC55?O)*Zi|=XL+h&GO9XJ+2KTc#z*NgI&_Zv%PWba8Ti{Z5F1oP%`}Ug5v&w-r z5B8GdA42>^cY-6djp3vp#OviJ>=;ST^`cx0 z@2lSyTY^-S?!0|4Yi!d#d?Rwp;gpqBFhOlyYDN7F{k%Alh14`G0iYFRV*vNkdG`Fd z5|qPGBhOdhe3TY>Z9*{JLz2`rq&ei_gAtuM6UgPd?#=$@0wx~7iD1x8a!fqmmnJ|Gzaw@U3!m#)yXiQ*qy zfntVSwycAfc&;NZ_TrJ$G7wSvgB8HI3n*$*KHO{ z-ppCnp%#Cv7+CWi4b3oe@wmG(-^#DM$ary;ao*w|1Y7-A2Lil;%@q8xv8(y!2$j2m zW*p^3;rE3d$i$%wfCvY8AZ7ymCF0btTjhTelRxxp=)rq;=;jd&uw4g1wz(|bQmX+F z#vLHe?WO>jWGC8XxwP(6+!|x&y2$iK_GyRGUW2TrXZ$ z|EF{_|GMSXY1q-l0By+{q6Ahq0WDUIayL-YM_u;emNs-5zUIknDCwd)H(bqaR~lix z0JN5|;k^B!XM=F@a%`^t*}{#tUo05L^e9%paXNfeR80BfLxTrNKHq#5%^Gazj7<6z z4 zHm=X}j$ONIA|KRqT7H7V4M+b;eBA%?JPN~ zkVFN`fnlj{LT!RRK>_vV*rKzFa)XbKSibGNM<)obTu&oHT4}6x{n}@j)h!x|T}M1T z97>noChtfonQty+UpSv4{QOIufWQXSK1l!NcsO?I)bOdzZP-}v?5pwfr%yM0jQS>& zdF;(8p^NyUC2X!27K%Th!75=+Ylf;0S!V6LsrV=m+c2+Y(DRhALu10H z>p}L@h#_EX*JVryA=M8n3t_N~t?G0W4Qcv>ms}C@J;#=PdcxG%vd*%v#uF>eYm;{{ z*w%KRPH`#jeqSj&%>+MC-g6CKdbCgaJ~j?SrrbD)kf`sQAg7R`$ls8!VL|g?&XOA_ zE#}vm+nsvU2D^UPOgiGIEbs8j?FM4nGuzHIJIl7+D~7e}YVs9e(+>6hE$GdaKh8!4 z?O}UqOuZEKu?3n;^`|Kx1Zp{81Pb49_6*w}ffj~$LJn;g|jIi>@)X5elDqKvnmC5-QSHR$49_oKpaSsvT) zOJ#lkuOtos=LC%ZOg{R*Jr@ilb^ODbZt*0?^AVW_nRz|E>40H{tpj)b@B9A0FV6dufbTDQ z|GzTsdM!BkUz#-}_=kNm^fqMDs~|LkatH{Tq({&?JbVBZAeZHv*D!|QEJbbc#pN@( zXx|OZZc-fiTOpN|Y>5nv@h~!~8g?kJ4HNA(%hZ!A>p!`~EIVWSg7Hh^mSs2_)s2Aa zkYgLSUCb8m^UJy^r7AhNrdZsTIo5S@!Tm$|*)2x;fL)Vw^tCYCme+D>t%L<$H{ z0Qg{sgj6~}Q!jo9^dbeNQ}l=Ad^392615nH5swH))dkIVix-Dm6yTNzSp<@C#PNlvKhR2Q@6YL_jvuHQsM*I$x8`gYY6I|@xZ|LpE zbC7umm$Z<|@0qPAIT)>jri8h>DdN_|&uqT{AWVBbpOu99M8(w{LKyfque2{ zV5ff!n=$?*w)YLZThLS7N3_Z3Rw$6H&;pF6wyOq1Es;w!(TZehe#VVjtH+pKOUWPF zEoeJ3WEM`gnXbNt1qAkCf)s{{;MJ|~q^8W32w|c?{$1gwEGxu4Gv#xnN z>%|?8#T+j?N%m%MyRCVuaP;<)asal9_r_A633Io^a^+dJO@;Fl>TGO*ZDDpkk5Cm2 z;{++JtIrDb2zjLybG+)xu%<%k=nB?G{-R%DPo;jX-_2VD{f9w3U)^C<`oITfT~fBr zE+o>oruy?WW!OAZI#bg_&wWbU>-CXiZ0dLREz_yT2;jsjXi~Nc@qXcq1|*gyG_u`b zo?=`*e4_ET+-v}=s#S!!jLcHs6;D*vsPfxGX z;ea!zoQ|q2xJ8aVR7e>dtiOd1Qpv3uUbfWCdZAt0zhk83Wdg|1%5doN+KUn@%Kc2@ z$tEXW#U2KmTDULBjUI7@;4&2{l6U%C>ucUS9XTS;X@8a5K@-FVS_L8V+_UqwGAA?y zb+>fSBQr2FxzJ4)nmkY)W+?!7jq=cT;vY=*KIBoY_TQS|oQKrG)JUM%*xv=Ryt+n=b<${;wkD|M6L0{{p4G5bat34tdaq*jftGn1S+Y-rG6$-(Psts``I&H$#oG|asCV&R47MwL z!I;=a!VJcYv zfCH8YBkKYb3{gNZ+G_wz5afuz?a}~W7(awNut05QWeK z=gw3c5tBi_I&@EZ$_ygi(d>6!=p`E zYYVApjoQ77E4F-8mF#4ojfdi)G!En6+_V8OX{#Ty7*7S-vzD_M@{V=h(}bW~zNdOJ z0&1tp4ls^Z+*l}+D-qFV0b|aW0aL~M#%vDCWir&NZTeN7?sX{Rg*`fa7Qq0S;r=Aw ztV*+%1}-%taEh-_aY_1H@kK@~^Qk(9TQN=(HVs@a3?(H-=z;BRAs^PqM+QoTCuT6> zG6dJS4c|Wr1Oo&sOWkqGutkjAR zd}2c9CrCQNVP0bXQ7|!y{qNR(5Y`&B1m4!A0Zk~5dzu>jPO)6RIE7oU1lhmVXaAPCJ&Kn~Fa$H|viEsXE0 
zmdVO$k4YTj3>}e6e4FvkdnA1mPWI9gr#;1VV5ld7xe*)~p8n0Mr0E|Yux*ISQX1Sr z03r4`vdD}DVNkFny+!$L0F3lT?09Cc!c|L``;>le$=29r^=mg7q(jGU=b1)4Quy3Fp_mu*;e&4N&S`V8*?#9q!N1Y!C098VmEN> zFY!G8TrK^-h8MrcLH`xW`u{q;BG0!_12P;p{RGW2qyYpjU_!R9jh*Is*}8fm>XPp@TVO7-v`QWF5p41O_%x2-B> zjP3)Y$q<^5Q5$006TuB5@b&^HH89kR=_g7a2o<)2sSE(FH5UhWH`z;@(!TAl{WceE z|M&k6!BsUH0xl3sME##2I$)H&ckYA_jr?;RWHv_#8}oJs*WYloGd zT447)N-s7Ls%TahA)ggwDzmLJUrL8K{Kn|}oA*b;D0kh13liI3ZLE$FFRb^>D#y78 zm6T*j=sM3L+6RGV=;u;as)~!vY&C!Q-TT5=scs2q`>mpW*$@AWKUV-G zkmS^9+Nbfo;p$&X$ahEn*Cp^b&f}kdN+v^SnwGTs!mZMCf3bLS_g1+2o3FyJ&-_1l zOg|6($M91qFc0{L(dao$F2Ce$7Y6-BAbNd^29d^0dCgn?u|2v&Fx5QBVIe^vd>(FY=GX0)Mu@FdeC$gtY9-(2yS{^&Q?> zHSMaXTv&Kunj`(ydlclpPz3WhIOnc_lxsD4$`#1;>^!+ z`L6b$=LPEN?LuF!Y5+VX^Quo*r>zHMlvq?QVC`B`T2`Z3#LqZsSRGOokD@D9dIOkl z+@SmOPmtv^^m#Wy_BEjyOe|m%TCs#(hX=hk4xVpZ%;u#t*U@+wGI#%z#RpUG>iuVm z8+U$p!Duf8@W9L3r{Z0P>B45hWY-rZ54ooPFla>;A2zq$XWl>g_w?^FDh5y?G6VbH z?-m&UD;L-Or-^od7HR$}-2A^-F8)i+^Z$bd#vEEERO|69rd5uMto~!Z1UJ0BSCaTe z7SG`)rmZnEgGWm($g$AAiz*+y_Vo~NfeD84c?10~4Z%6ujGt!v3CbVs;>3w7v>jPQ zn|AfUMn&A0pwG+#E2f+FtFjp%U#}zH>|jk$ue>nc;3@Q&XF(&tp=Q-NVWgL)4!Bi8 zW+ht>pv?|5r$0H(irnuv);2_ztx5EkevPmC;xuI>cFbT;;gvA+eBOqBs-f!ir}Qnw z_j&ab*n>it3qWxy(H3>J7bV(f;L7pt&48TMxGTxv!S)XP>kHrM%~Ypz$FnbVuh{$K zhuZhk1VU}eK2A83#9b(Nl^mB_>y@GHEazLO#@eXcX~%6_x&&W|-dACZjgYT($R~EL z_Rd0%Q6w(A`27f3!73wo9$G^nu zKnfqylsS58y}Iv%E7af_gSw*U#TQXMUBxXo1{d~XYtD&KLI>8KG6B?UJ+Na5=HcSO<^*6_o z?pXi}cqS_T;y`*^O{vy_lQC=P#Ivs<*D|gf-V2h<5w>7nL~9;ohO3h|+SFAZ%QeDg zWrP?4%vo*YHgvorb!%RQqp5uMjA5+rR0{Mgxz2o!rGGP3HZ zA5(7jjqj^3Bs1=wdxGAPN(Ir4q}y*OdU+aagi;Um`*zjXpF&w^QdAMLLl9Bu8o3oo zhPPuG@|Nq%UX>ip@7uVmE9cGq@Gz_3hmCUqTmZM6NvsrsWyC|`l)TuepkbPb+w`bH z;DBxQQ~iK9Vp5t$PuF=K+L>lNKiXwoN@VE-3!*Pjtf;34HKk_l?hRE<+Sm?y z0Zi$NoamS_EU%^Wb9G*Gp^J3zhYFQMhnet72M>zMVz|J-!Z0wAWgw@Rq}-@Cel~A& z>A2=9lQ1V!Q`f>LTaQ4Bjr~T=@aER&8~ffg2gc4}ZS8|hniy4qNKZ8xY4moQg&o(; zaksXwI*~M%?lR|ns>-zE@QKixmiebMjtL{iM4z^71#$-EvK19fKs;M-6e7WnlLy-UKT9 zcriG9I4=Ms8WI7~%TYjgfUknJc2H#iz%K>QPUBbu06+J|gS7+vUf(B7BTRh@QdK}L zG#MK4SzspzPDf)FUxeNTa6l~#c@0b;Vpw2(5@di&^CSdD#O^~vV1YDWFRu-74}9$` z>`a(lfc8Uxl8{KYp*@rA0FXg|NuLCu7VJU<)B&eS0)(s&ke5V|dAq1x_<|mZMx-v% z`qMUJ|Fz$6#EgI%$^f zB`Jf!%tHf3v?{vrxCNy439{w&#O#%}dfke%SZkZbmq4JSUkS?wmJ1h?+8>X38qQa9 zKCz$4FObir<54*5DrlQo>T}@%63u8`dplf~d`?+hOu4~hfJQ0vr;zp!va5cWr_Eww z=VUU&vhTn4UN*fBDFuZb@NAh+Q@I030aYFt!pGqSX>K7IQQDlmc8n@NxD_MfuD(lS zzD>9={Y^O?`P9!`_p>|Xm>a)C!kQ4yKD-tX{_H66Y(52;Oz^-%OUb?)^weuZZ(i0> zz|p5PEEBlu(jI4z(aeP*gd5;B1JiBz*cSf&+u9TaSx?*?%f@v0`XqTt2YlNUDuG-p zC$itrS9e}T5ta2^HEUzua$DT3qnByW&|~xYyt{D~Qw{B;bmjAcE9`5CVqm3(_y=pR zQx*jkva_C1&m00S@d2{XU_Y7Yl(iMq?I#c; zFZ_Vt^C5Spu(KpTGRJ{9y%rxPEagtX;A&o0+=;=AEWEx_+cUhmpZ!=W8Q})n?}*#G zu6IzaBo!p6s?Fjm+sO>~dEanazL(>?+QL?W*HPLSU(XK{QL4-U_Zemx-;BCl(MQ(_ z^5Y{^9qr;=v^9#XPcer|8-)>9TJ~W=StX%@70*W(Rj$^FQp0}3cYF`t%5KU*T*vwx zB`~9!C)}BY=MKwAIiPHb>7f18xIJ%(lU%%t;mKui)lX2O#57P5yj`lXn30u-*FFs1 zJbV8jPHCkZuDk+_<&p9%Rfh*lbLlgI@i4vCBK?v(VCD({M?gfVjWpl;L+Bf@QK{=T zpET`59mv(8bSF^c3dafmojp(|&Jq9_qWr1KgF!=$xoHgkzn?(X1vigi^W65^?4dp1UcVErS zgOJx+cf(S_=FN4cUSdl6qT`9@0dZ%u zDa1MTxvs-(sgXri=7-BpIa8IjpchSwmlN2L)ts>AtKt0JLWYHb&a=GLqS}Vino=6l zi3dY6s7zvy)birpq$#B&2`jvPTm%Th90m$401hNjP2(1%Qu{CNt+o9bjYNS zQSDLhmt3pHOW}?K^W^)`xJr$(O*zH6YC7-b1!Q?MvbZRR{S{CNy7eaP=K|ya49SXU zC3pJsH&UIG$F#tmIS>AgfvJ2z6-1<5vu(^aS?%Mg@N5Y~c323AK3R3>>_@-LjnF=O zP+Se;UYTb9je5KxxX3%jPt?S4`eJHaEdrv}&lqIu8dpT99MMS}NE}^*T3PmjQ47=% zD$oN}qyRwLq6hTyNaSm(x5h^bfXhgAsYk`x3Oftm1P}wB9snUbPSyGlxL`r>E6Yy_ zOuJeWw@zvjkBK=eUmnjW0~)*Ukzsd&W{LsapVt-ixy`fiJNLGRK_7i zQ{Au=3(>!lfBy9N7nJSSV2=N4yV-v*s70)!`q9P*k+8W)02p>q2I}2HrsA((AyJ>M 
zr-M1R{en!|*|9dw(=4Hv`C>iO%tsoHBu>sP(!c|A=-RlG4<(+9wcR02=!IAoTXtou zqKF%-3ltr4?`(L+%s3oOmTI>kJ-%Ugs@u)_ZqzCGTs#@CVRSqXBCp1eWi4NlmYApG>|itD}m~6x@*d8qP%x zB!>`vTJ}-#no>oV75!baEYwbHdalJ4pTGTb>$@_Wf$9TMHcg{upuNo?c3({Ctaol- zW5l&5l9qhIOvcX6zDVX5;vrp*Z8=|#%ogc=Zv}CfuiEC*z5rw(Ty(EvJ;DwEZEa~I zf;9yc)j+QHbhczCA3R3DO@2|Cmy3_WZ8_zmD_-~6;rJOV(8Uw6TJ8FzND6V@l-o3;4{f#h#I2GuYA>C>Q>xXu80I`lnfnrTy4r+2cQ$|{Zs zZO1MoHF^^|XT_@PiC#~fB1bGMo?aGx?|gnz_=kLpGP9!`So{3ZlNoO%GV1Fm{rIZ% z8%&`>p-e={P9)Z0<`ews8tvrL6k{r1ZIni_ohVCPwb~1d6Ynf_o}Wy7U(FNGAaO7& zRYHKQ*}b=+b?i>_gqjsz4ww}Y`Zln>CPSB`kug}y`&qT;`=+{QhfbF`J?PfQ&G-t} zhEbj?J6c<_k_lWm0N2CXBpZ3QqH{_WdEU`B9s*?%$>d*orOEX3U32~n04n(#=xTWoo+iGQAn&PsNpej@} zUiw~=Z(GBlJYCWwoykngJ`4yp7agh&EM9=gPf)+h()G-{EQJ9N6EX|jMdts2KZDG7 z|FJHc#7^j~gj%0Bv#$#lg{IGFT zZ}Z|($GOwwJ<8QdX{0MfX}l!|$4lwp{n8ov;*$6=zYXl~(tT*ktjZd+NIGU<1~7Zy z{wmAk6>2U~7@$dgo{*I3z4KwE6e!LT`Jt>-$$=x2UkB``dQ&?1SWqH+KYo7!Yq-fl zg`d#F7b}pXifQ-faRb#%w~vI}zsIJJ3QnM^9h4#S{WI{45!>xPC?|M~{f+ntm9xp{ zU9-o@2v5L;AYE$S35d76F==4jxu)dqm=l`Y+bkI3sZANAy|TkNPABp8G6Mah+Xh#a z_v2R;gq^H93HXMT&u5W~8w0?%km&6G!MaOFK-3r)pg38 zyU4`{%?qkG5CKW5s-!jZYB*VPCx5sMTiA(6On-n}I`Q*)qEFbB$Ge^E42y2L#G&MnZT=+@1W$oGNz%qZPmBiYWPUuw|F!|K(>W*8v zOlS0t-4(HBipf)C$4`(?h10@PoKD!;0K6C563w-Kq^kQaM_abvoO>lo%~MKn1#j8f zeO9vpE*f~BNj|G$g?HI|C29xn4BSsKnPU)=NodEl~I zu)FMIyC%*qXil%O-xnYR;o*a{)jWdPbrzD>6Z<1iK0KK~i&RbMiYRbM zmdbTez!8lKu5m_ut?XZPinMf-VtWZQa#75|rTG>sA7p$w_BuL5_xV zhguGbWVlJ&b0@3#jL3e0C_v35T*1t7nDqp+BKrBNgF3klP9Fz~cBp;5Jp5%%yq92V zCuqI)v%tn3}ljgT0!%esOg$B?(87AUVVHMxj zt>%QL1+1G6;WZJbJ_dX54+YUK(1!WCGr(gjt@Yu1+Gb&RRj17n)xD_PluqwG|KE$l z6h~s=T1IF^1~eANvRxv};Z+@y@wkEU$HBO){&iLD?F@*=q~Q&L@S{um_YJM8_0&{n z<`Kzjox|u#pfJ3KBCxNeg)r*}5sh#)ufJZ{99=ZK{w0j@HAs9S(ilDiUAqgztYNA; zlJ7{Qyeuc^=0|dgQ48WSlL94C_Uu#TqEFKLv9u=|7*Jt0y|vsdmsYe^Bi#2(twI(%Si` zT!@)2XZLO|Za2pWzO`_v!RyVv#+7V!8|m5XfHrHD!bVT#DH}Y-&^WZ=Oh3&ozc7tF zjZOs?0S46GcOYXM#+y`3wDqx$3kE9ZtJ)SIAtAy$8#ct8t;OOAc#8aqN1WU`Ra;n{Fwk%k~s#I_<&MRu05H;SS&Vz2pqU|%7 zXM!JA51Q%-qynkxB}0b=brVjo-O_&$U$vmdFYIH@(f0g3M{Z8f!#wW&KMxoG+h&+@ zf9Fs7A6b?Hxkw!0XQ&1P7!NzPfop`r`YdaQUy54h*|L+53- zMBPWj&xZF^P*vc~^%C008;R^~2=4J`GMeHJ0it)j`CUVoKDTtl2Jc1MXfV->s6wAk zi48QKrG$iPG!A>+?O#P|o+!HA>=z^!C;9qo;zy0+Dh@Rva8q+_iNHK#vNy#wkiAn| z3>_uMK#px0SgcW>=$9KUEwjFo#owLyLE=i2$LNWz=XFX;k2*rLH^wk_0lzz(Wj~%V z!SY!lW_e;GftNZ7WGzb$<)(&--_K2tPR%XfI;pRIZdUg&XTGm^woaKsvwCz{3dKLv z z$-fuFg-#|+et9dh{Pwy!YJEz5&+XaEtPvXppbvWjpq*>9vw_ZSqH^kHE}{(OsiuW1 z^@$z5YeIorKkSZvVV9MD9A%Y~6xFyv6Kv>6o8qV?N9I$724!jd<1M=|EK0ELhMO6- z%*a;%-i;SIXHOVKH9hV)T9U{JC9-$G#>{5k1%fT_@e?2L?#SBuTkA4YX?ZNj$Im zu)4}j8q2KH)in{EWRf1L zKE=KN?qvCr!d=jx-G&l^ZY z7>wJG8P#rob6&fZ=PE_oOY3s>?S0WGRJshV;7bgJ<+^J#5tZv^dO6&E0%HwELX`@u z_pjW)vgNU$tCslqgjjY%(vc(j<`ktsl{T~(xf%Zxlt4Sv=cx&dP?#=_FYW8a7In@~ zjgRT@T%M4lRyuDE)i{f1665xzIn{-4n!O4se24TmsoeU$H$b8%OqJqH#t`}gJD+5C z^)WUpoysWE#^JZu3K`u&58b;P>4B(T2nXaYLV&D7QO92+igaL#+|>QhZ@F|FUufVo z#TPnwjWYs7&+n~4J^M0u3U{9QTy=HB!XAbwIlduI$HQFMc>qsFt`f3r?+j2vcXGMB z0HVNm1zl0)m3Ia7y*0p@A_=DEhM{*X`w;XG))hCR&~(-HQAX}XiY!M9VjRy9Yf(Dp zFE@{h9~N9!5mn1C%F;cv*`d6A>)t>^z(1|by!D$?gBn$OKgGB{rnA)_rE;Ou%ZoMF zP@@0iE#0F`_e?-^J;wJ!r2)Guc%nAj<92)YK^u$jvZvFJda*V>FL`%XQ2CsM`Kf@j zt#pr}03W5<2-rwBF2iw%$XUYNR;bhn{xQY*My!RglHb|gp+?k;BbgrsYcxTbV-7pjshV(xySGNdoQ20KUiG$b{t$|1n>>9(L{7La(u1(#yfSCN<#$ z!=xl=X!0moaN>?5q0Ez!e+SyjO}C@Pvsq00f)U)f@@^M!MNrH~6hQSQ5IkH`rpD}%F=&r%iQDt}`LK{UDi?lO|n5@{04xjOKZlY!_B z_awG{o`_D7y29`JgT_A05BqZw{ybc6sfN2qnUNJhXqXt$yzXJ!RV=g2VDKGO)43W}5cbC-d0gBygiQ9LhJg?6s*K@=o) z+GX$roVEAMqsc(c5KFE1;r(DH-}`rqLT?b9p50(!K^(W%j|k>^qcOthM4=1swKxQk 
z!|6rLD?C;-Iqj3(@Ol9mJXwB_NMm#@FYyQLmm1H-6+Z0A5b`0%fsFM@U&-tRPKZl{D=pSwDlsh$ zT2m&iUQDOgF+_FTtF7As2`quQ|=x(76{!3$;ci5#xgLmG`gxXQ=ESt7Ka8RmTzetz#NbuTVB&heW zzK|dOT`=u;=VF;EXpI(L+mU?vU2!J6%ns96hmqGK0R!;Tb7Xzer$)}8>Eqf0K*1!- zwYtyqwN7)6+JB6rvouT}XeR1Zny0aGn^)wBl-8Q3Idj%zr3=$`1A>VVXBhV6!rg|0 zLfP5w_2;((Z?+$~5NOx@QGO-+)s7Z$?US9W8ALOS+Yn&--|_!Eu^RZ zAmI*iyi^`@&*b=&6tJBGNfY<+336jQuWFjKU_(W{o+19o_lr5~sTf?OVtm^(KY*OT zuVgAnkkxedPMmOxExd{ZQ@NnHE0Hk;AINwc%X|H&^Je4IZZoNMgY+H-?Z@rc06rDh z*~zB>o#Zl#U28gO-OteWy-Y~)WF0=i(SZFLdy~aC1eGh6rhCYWpw&WkrBc78rlE)dLVy&vCB$RrLi?|hpD>+&&OV~&!|Sc zi4J^EJ;itj+>)D$y`^>L#8n0EXwcjK_(b5tq2fX>A2gHzE`9ac<(LqAJJ)iJ&!Zhs zsV&5ArM(%j-KUWNpfiQI=#MvOlC8aRg0kbgmc`s?~@`)3u ztnk%mi2BxYo*CF*y*j{a`K$MqD;i#hc!+lPEB-xgOy$#hTEu-B0bT&@RAWlMrPjclu+gDjmU*}%&bFXgU9YcULJNvpX z+`lkB|Md93FMw9*kNO#O#UF|-70%O`chp@%_4@`*bwih?xGkU3Mll@K$QTSA)hr}u zslIA=U}?luwz%MGxroH;r#6N6njUUTF)L9iHeCeRDU$!{g$KvO=FW@2I0)bD1+1UhNRL zDVe=(Sd-~TlC@Ma*@$>qm;8;e zp-W7QopbKrF$f*JEss_{T--5=e^gO{@V4`Fl9{Ee_2-MAU`3vG~DaDy7T4(QC^yx#_- zL5$R@2J?$VM}(x^o&Vl zILCyW0-(el-dBL;B>6qdO1X2Y=3My+OUdUs^pQ(;ipyu(W#3PHG4vbML8U;>cfo{rYwJP4S1g*9+F?dOna2H}=h4opEispXagP_unan zJXKwv{EWXn{iyxs?Qc21o$nUkRkz;!sKzCYKb`_Nc7=Y?igQc*QG){@;05-^BCT@%}J> zsJr)0oXCZ>UnW=FUYF){Ugt@H`EH@J4GfYtVe6Y}*ca4Te)v-*-}dE1$yzl_xzMfc z*3Oo9j!(Wb=UULJrIPM@_iWplWtKTl{x{=BU=>gkQT%&_>EAeu>%UijyXj|dRB!dm zYQrDt2esS}-}RKt=X*IJ)2eV@^yK_Jj(8(?edStjp?ax1YhIlVnX_cqbkWlyO_;}T zhF>y8-W(2Wq5yY?&jqa@;Hu}gTX_Mv#gY>^o^X9n{V$HySKWbg;1l$K+X~}80#__4 ztq;FGF92y{45Kq}5y}MH`+ti~?SP6De&_#svj0Q~)-ra!W2p{MYa6W1LJeAU_WJtc zT|n1{{3~6Gxc1pMPc4%gR@JP=v!iYOwEeh_;|pQbg)O22(jthB+$-)WdDNoL;t`34EO&td@MoZmwr7f zdj4ya{i)2i>34ogm%p6|Jgri{rT#lmJ@9Dm#pnX-*JUhzwWWTF?V8{7i7;EFHmv_s z`_}ozPjc^XbN|b*PY~#g-@t4A^m@_w`*tqsUv=?cz_-osYI?zw84JV@*#ni<1J5JB zh%Rvbx=A3&)pFNr&rjT5cT@fg(+}VrdHwtU3_o_6qw#+|>#|$z{?qx^`JJE4-rrXK z*AUO~pW*#qkPpQ}@_>id^~G-y+qy8SChOWB^@Ts!J@>Z<%|E=4zprd^_$;-%LjF2y z%8p1dg;qUY=8*bZu4~1^m9C#PUa1H9Ms5^!TEURO#~`vLFjftDOCl?8`9Fv6^>1AA9Mv;9Ca$8>I zOkA^ZlT|x#i+17!$2S?wPv-y7n74b`kLllhV*Nd?=$6KsJw763?Od5`be5+f-i-)@((&z0a*G%Wsm~}?aqs?h|toxl4dCl5ip1-;M z+v-QN`$yONN6&Xp|M>d8*w^YprlXu^=eNH(uXO5aaby3Z`bU;ep3ZErn~2n$BK2M^ zCZy&sN$jDx=i20FKtwdtEB+~a|F-(ua29jBD!p@)P`lBKE;0j8RjScXQ-9`xPX`6(th9mxpGDIwe?oO1**uBi~k9pwfH zC;xu^E2%vB_v@e6_DC}QKio~;|NJKZ{UY$d9EJ;@C;xu^^CH-pYk&m?lAJw~E0{+e zhzQ%!5Tr#KyjHeX$#l}gDgQoJ|Goqqkwau7U_#}%Hy3zm@8Qq(j~#l6%mIzFAejW5 z4@!ZpMviA-GyeUuf9(E8c~<$K-~7K{0H>xHGQgSS1vn>Mt4EfSN6N5_MCN6-c5p84 z2ZvEQhDQ*NaF7~xAlA}wGz96F28HgPn4h!R`ucnG`mF8nTEOiKsD_vctRRppfzjG* zw7P}1N=DlQ1X@(rVtS;HOnV~We)IP7Yrs8T$nCV1YfGxVC2LM3a{M?}lp$CK9Q!pU rdkm7o6Gw>YK8*l1j6c=_hhTuiFq4-7cb|7o+y5)J7I>Zx`~RB&kn;59 literal 0 HcmV?d00001 diff --git a/vllm/distributed/kv_transfer/kv_connector/__init__.py b/vllm/distributed/kv_transfer/kv_connector/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py new file mode 100644 index 0000000000000..6089e3babac3e --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -0,0 +1,122 @@ +""" +KVConnectorBase Class for Distributed KV Cache & Hidden State communication + +The class provides two primary abstract methods: +1. send_kv_caches_and_hidden_states(): Send KV caches and hidden states +2. 
recv_kv_caches_and_hidden_states(): Recv KV caches and hidden states +""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Tuple, Union + +import torch + +from vllm.sequence import IntermediateTensors + +if TYPE_CHECKING: + from vllm.config import VllmConfig + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + + +class KVConnectorBase(ABC): + """ + Abstract base class for a KV connector. + + The class provides two primary abstract methods: + 1. send_kv_caches_and_hidden_states(): Send KV caches and hidden states + 2. recv_kv_caches_and_hidden_states(): Recv KV caches and hidden states + """ + + @abstractmethod + def __init__( + self, + rank: int, + local_rank: int, + config: "VllmConfig", + ): + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + connector when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + """ + Send KV caches and hidden states to the connector. + + This method processes the input tokens, KV caches, and + hidden/intermediate states for a given model and sends the data to the + decode instance. + + Args: + model_executable (torch.nn.Module): The model executable containing + start and end layer information. + model_input (ModelInputForGPUWithSamplingMetadata): The input + metadata from vLLM. + kv_caches (List[torch.Tensor]): List of KV caches (keys and values) + for each layer. + hidden_or_intermediate_states (Union[torch.Tensor, + IntermediateTensors]): + The hidden or intermediate states associated with the tokens. + + Returns: + None + + """ + + raise NotImplementedError + + @abstractmethod + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + """ + Receive KV caches and hidden states from the connector. + + This method attempts to retrieve KV caches and hidden states for input + tokens. If all required KV caches and hidden states are received, it + will bypass model input, else it will fall back to normal vLLM model + forwarding. + + Args: + model_executable (torch.nn.Module): + The model executable from vLLM modelrunner. + model_input (ModelInputForGPUWithSamplingMetadata): + The model input from vLLM modelrunner. + kv_caches (List[torch.Tensor]): + List of KV caches for each layer. + + Returns: + - hidden_or_intermediate_states (torch.Tensor or + IntermediateTensors): + Concatenated hidden states if all required data is retrieved, + otherwise `None`. + - bypass_model_exec (bool): + Indicates whether the model execution can be skipped (True) or + needs to be redone (False). + - model_input (ModelInputForGPUWithSamplingMetadata): + Optionally adjusted input metadata for re-execution when + `bypass_model_exec=False`. 
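To make this contract concrete, here is a minimal decode-side sketch (an illustration against the abstract interface above, not code from this patch; `connector` is assumed to be a concrete KVConnectorBase implementation and `run_model` stands in for the worker's normal forward pass):

```
def execute_with_kv_transfer(connector, model, model_input, kv_caches, run_model):
    # Try to pull KV caches and hidden states produced by the prefill instance.
    hidden, bypass_model_exec, model_input = (
        connector.recv_kv_caches_and_hidden_states(model, model_input,
                                                   kv_caches))
    if not bypass_model_exec:
        # Something was missing: fall back to a normal forward pass.
        hidden = run_model(model, model_input, kv_caches)
    return hidden
```

On the prefill side, the worker would instead call `send_kv_caches_and_hidden_states()` after its forward pass so that the decode instance can consume the result.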
+ + """ + + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py new file mode 100644 index 0000000000000..015f892cec933 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from .base import KVConnectorBase + +if TYPE_CHECKING: + from vllm.config import VllmConfig + + +class KVConnectorFactory: + + @staticmethod + def create_connector(rank: int, local_rank: int, + config: "VllmConfig") -> KVConnectorBase: + if config.kv_transfer_config.kv_connector == 'PyNcclConnector': + from .simple_connector import SimpleConnector + return SimpleConnector(rank, local_rank, config) + else: + raise ValueError(f"Unsupported connector type: " + f"{config.kv_connector}") diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py new file mode 100644 index 0000000000000..5870070a54c75 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -0,0 +1,261 @@ +""" +Simple KV Cache Connector for Distributed Machine Learning Inference + +The SimpleConnector transfers KV caches between prefill vLLM worker (KV cache +producer) and decode vLLM worker (KV cache consumer) using PyNcclPipe. + +But the logic can be extended to support other pipe and lookup buffer. +""" +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +import torch + +from vllm import _custom_ops as ops +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( + SimpleBuffer) +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + +logger = init_logger(__name__) + + +class SimpleConnector(KVConnectorBase): + + def __init__( + self, + rank: int, + local_rank: int, + config: VllmConfig, + ): + + self.config = config.kv_transfer_config + + logger.info("Initializing PyNcclConfig under kv_transfer_config %s", + self.config) + + self.lookup_buffer_size = self.config.kv_buffer_size + + self.producer_buffer: Optional[SimpleBuffer] = None + self.consumer_buffer: Optional[SimpleBuffer] = None + + # 2 pipes for every rank in the world + port_offset_base = 2 * rank + + # In disaggregated prefill, the prefill vLLM only uses send pipe + # and the decode vLLM only uses recv pipe + if self.config.is_kv_producer: + + self.producer_data_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base, + ) + self.producer_signal_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base + 1, + device="cpu", + ) + self.producer_buffer = SimpleBuffer(self.producer_signal_pipe, + self.producer_data_pipe, + self.config.kv_buffer_size) + + else: + + # the current vLLM instance is KV consumer, so it needs to connect + # its recv pipe to the send pipe of KV producder + self.consumer_data_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base, + ) + self.consumer_signal_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base + 1, + device="cpu", + ) + self.consumer_buffer = SimpleBuffer( + self.consumer_signal_pipe, + 
self.consumer_data_pipe, + self.config.kv_buffer_size, + ) + + def select(self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + + assert self.consumer_buffer is not None, "Please initialize the "\ + "consumer buffer before calling select." + return self.consumer_buffer.drop_select(input_tokens, roi) + + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: + + assert self.producer_buffer is not None, "Please initialize the "\ + "producer buffer before calling insert." + + self.producer_buffer.insert(input_tokens, roi, key, value, hidden) + + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer + + # query_lens contains new KV caches that are added to vLLM. + # so we will send them to decode instance + # FIXME(Kuntai): This assume that all requests are prefill. + for idx, slen in enumerate(seq_lens): + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + current_tokens = input_tokens_tensor[start_pos:end_pos] + + keys, values = [], [] + + for layer_id in range(start_layer, end_layer): + kv_cache = kv_caches[layer_id - start_layer] + + _, _, num_heads, head_size = kv_cache[0].shape + + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + + current_slot_mapping = slot_mapping_flat[start_pos:end_pos] + + keys.append(key_cache[current_slot_mapping].unsqueeze(0)) + values.append(value_cache[current_slot_mapping].unsqueeze(0)) + + keys = torch.cat(keys, dim=0) + values = torch.cat(values, dim=0) + + self.insert(current_tokens, + torch.ones_like(current_tokens, + dtype=bool), keys, values, + hidden_or_intermediate_states[start_pos:end_pos]) + + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) + + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + + # When bypass_model_exec is set to False, it means that at least for one + # request its corresponding KV cache or hidden state is missing. + # In this case we need to do prefilling to recompute missing KV cache + # and hidden states. + bypass_model_exec = True + + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + + hidden_or_intermediate_states_for_one_req = [] + + input_tokens_list = [] + num_computed_tokens_list = [] + start_pos_list = [] + + # enumerate different requests + # FIXME(Kuntai): This impl assumes that all requests are prefill. 
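+        # For each request in the flattened batch: query the lookup buffer
+        # with the request's tokens. If a (possibly partial) match comes
+        # back, write the received K/V tensors into the paged KV cache via
+        # the slot mapping and collect the hidden states; if anything is
+        # missing, bypass_model_exec is set to False so the whole batch is
+        # recomputed by a normal forward pass after this loop.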
+ for idx, slen in enumerate(seq_lens): + + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + current_tokens = input_tokens_tensor[start_pos:end_pos] + num_tokens = slen + + # collecting data for rebuilding the input + input_tokens_list.append(current_tokens) + start_pos_list.append(start_pos) + + ret = self.select(current_tokens, + torch.ones_like(current_tokens, dtype=bool)) + if ret[0] is None: + # didn't find any match. + bypass_model_exec = False + num_computed_tokens_list.append(0) + continue + + roi: torch.Tensor = ret[1] + keys: torch.Tensor = ret[2] + values: torch.Tensor = ret[3] + hidden: torch.Tensor = ret[4] + + num_computed_tokens = roi.shape[0] + num_computed_tokens_list.append(num_computed_tokens) + + # check if both KV cache and the hidden states are received + # If not, need to redo the forwarding to compute missing states + if not all([(num_computed_tokens == num_tokens), hidden is not None + ]): + bypass_model_exec = False + + # update the end position based on how many tokens are cached. + end_pos = start_pos + num_computed_tokens + + # put received KV caches into paged memory + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + kv_cache = kv_caches[i - model_executable.model.start_layer] + layer = model_executable.model.layers[i] + + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + keys[i - model_executable.model.start_layer].to( + key_cache.device), + values[i - model_executable.model.start_layer].to( + value_cache.device), + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + hidden_or_intermediate_states_for_one_req.append(hidden) + + if not bypass_model_exec: + # Some of the KV cache is not retrieved + # Here we will fall back to normal model forwarding + # But optionally you can adjust model_input so that you only do + # prefilling on those tokens that are missing KV caches. + logger.debug( + "[rank%d]: Failed to receive all KVs and hidden " + "states, redo model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = None + + else: + logger.debug( + "[rank%d]: Successfully received all KVs and hidden " + "states, skip model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) + + return hidden_or_intermediate_states, bypass_model_exec, model_input + + def close(self): + self.producer_data_pipe.close() + self.producer_signal_pipe.close() + self.consumer_data_pipe.close() + self.consumer_signal_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py new file mode 100644 index 0000000000000..bad119a1aa929 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -0,0 +1,108 @@ +""" +This file contains a new class `KVLookupBufferBase` that allows developers to +think of KV cache operations as inserting new KV cache entries (`insert`) +into the lookup buffer and querying existing KV caches (`drop_select`) +from the lookup buffer. + +All distributed communications are abstracted behind this class. 
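The insert/drop_select abstraction described above can be pictured with a minimal usage sketch (not part of the patch itself; it assumes `buffer` is a concrete implementation of this interface and `key`, `value`, `hidden` are tensors already extracted from the KV cache):

```
import torch

# Producer (prefill) side: publish the KV entries for a request's tokens.
tokens = torch.tensor([101, 2023, 2003, 1037])
roi = torch.ones_like(tokens, dtype=torch.bool)  # KV available for every token
buffer.insert(tokens, roi, key, value, hidden)

# Consumer (decode) side: query the buffer and remove the matching entry.
tokens_out, roi_out, key_out, value_out, hidden_out = buffer.drop_select(
    tokens, roi)
```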
+""" + +from abc import ABC, abstractmethod +from typing import List, Optional + +import torch + + +class KVLookupBufferBase(ABC): + """ + Abstract base class for a lookup buffer. + + This class provides an abstraction for a key-value (KV) cache lookup buffer. + + The key of the lookup buffer: + - input_tokens: token IDs of the request + - roi: a binary mask on top of input_tokens. + - Purpose of roi: Since KV cache may only be available for a subset of + tokens in the input (for example, when vLLM is connected to an external + KV cache service), roi specifies the subset of tokens that the KV cache + is associated with. + - NOTE: roi can be further extended to describe which part of KV the + current process is holding (each process may only hold a part of KV + due to TP and PP). This is not implemented for now. + + The value of the lookup buffer: + - key: the key tensor in the KV cache + - value: the value tensor in the KV cache + - hidden: the final hidden state generated by model forwarding. This allows + vLLM to bypass further model forwarding by transmitting the hidden state. + """ + + @abstractmethod + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: + """Insert into the lookup buffer. + + The functionality is similar to the following python statement + ``` + buffer[input_tokens, roi] = [key, value, hidden] + ``` + + FIXME: in the future, we should only have two arguments, key and value, + where key is a tensor dict and value is a tensor dict. + + FIXME: we should transmit both sampler outputs and the hidden states. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + key (torch.Tensor): The key tensor in the KV cache. + value (torch.Tensor): The value tensor in the KV cache. + hidden (torch.Tensor): The final hidden state tensor generated + during model forwarding to bypass model + forwarding. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def drop_select( + self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + """Select and *drop* KV cache entries from the lookup buffer. + + The functionality is similar to the following python statements + ``` + ret = buffer.pop(input_tokens, roi) + return ret + ``` + + If `input_tokens` and `roi` is `None`, it means selecting any of the + KV caches in the buffer, return, and remove it from the buffer, useful + when offloading KV cache to KV cache storage service. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + + Returns: + List[Optional[torch.Tensor]]: A list of tensors. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + lookup buffer when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. 
+ """ + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py new file mode 100644 index 0000000000000..fe8d8d7375f36 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -0,0 +1,242 @@ +""" + Implements a distributed key-value (KV) cache transfer mechanism. + + Key Features: + - Distributed KV cache transmission using PyNccl pipes. + - Non-blocking `insert`, blocking `drop_select`. + - Use CPU signal pipe to avoid racing condition + - Handles buffer size constraints and provide backpressure mechanism to + stop the prefill instance when the decode instance is slow. +""" +import threading +import time +from collections import deque +from typing import Deque, List, Optional, Union + +import torch + +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class SimpleBuffer(KVLookupBufferBase): + + def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, + buffer_size_thresh: float): + """ + signal_pipe: on CPU + + NOTE: on-device recv will block all threads in the process, making the + KV cache producer unable to listen to new request while transmitting + KV cache. Luckily CPU recv only blocks the current thread so we use + CPU recv to listen to new request. + + data_pipe: on device (e.g. GPU) + """ + + self.buffer: Deque[List[torch.Tensor]] = deque() + + self.buffer_size = 0 + self.buffer_size_threshold = buffer_size_thresh + self.buffer_lock = threading.Lock() + self.signal_pipe = signal_pipe + self.data_pipe = data_pipe + self.request_handling_thread: Optional[threading.Thread] = None + + self.normal_signal = torch.tensor([0], device="cpu") + self.end_signal = None + + def _matches(self, tokens_roi_sender: List[torch.Tensor], + tokens_roi_recver: List[torch.Tensor]): + + # tokens_roi_sender: tokens and roi of the producer (in the buffer) + # tokens_roi_recver: tokens and roi of the consumer (query) + + tokens_sender = tokens_roi_sender[0] + tokens_recver = tokens_roi_recver[0] + roi_sender = tokens_roi_sender[1] + roi_recver = tokens_roi_recver[1] + + if tokens_recver is None: + # consumer sends an empty request + # semantics: DROP SELECT * LIMIT 1 + # so any of the data in the buffer can be drop-selected + return True + + # Assuming that roi is a binary mask on tokens + tokens_sender = tokens_sender[roi_sender] + tokens_recver = tokens_recver[roi_recver] + + # simple common prefix matching + min_length = min(len(tokens_sender), len(tokens_recver)) + if torch.allclose(tokens_sender[:min_length], + tokens_recver[:min_length]): + return min_length + + return 0 + + def _send_tensor_and_dec_size(self, + tensor: Optional[torch.Tensor]) -> None: + + assert tensor is not None, "Use self.data_pipe.send(None) instead" + self.buffer_size -= tensor.element_size() * tensor.numel() + if tensor.dtype == torch.bool: + tensor = tensor.float() + self.data_pipe.send_tensor(tensor) + + def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): + + if isinstance(data, torch.Tensor): + return data.element_size() * data.numel() + if not data: + # cannot perform `not data` on a tensor + # so this check needs to go after the check above + return 0 + + raise AssertionError(f"Unknown data type {type(data)}") + + def _add_to_buffer(self, input_tokens: torch.Tensor, roi: 
torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor): + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone() + if isinstance(key, torch.Tensor): + key = key.clone() + if isinstance(value, torch.Tensor): + value = value.clone() + if isinstance(hidden, torch.Tensor): + hidden = hidden.clone() + + buffer_item = [input_tokens, roi, key, value, hidden] + + with self.buffer_lock: + for data in buffer_item: + self.buffer_size += self._get_element_size(data) + self.buffer.append(buffer_item) + + def _is_end_signal(self, signal): + return signal is None + + def drop_select_handler(self): + + try: + + while True: + signal = self.signal_pipe.recv_tensor() + if self._is_end_signal(signal): + logger.info("Received end signal!") + break + + input_tokens = self.data_pipe.recv_tensor() + + roi = self.data_pipe.recv_tensor() + assert roi is not None, "Please provide the roi when sending "\ + "drop-select request" + roi = (roi > 0.5) + tokens_roi_recver = [input_tokens, roi] + + matched_length = 0 + + # perform input tokens and roi matching + # FIXME: this matching is O(n), ideally it should be O(1) + # but this buffer size won't (and shouldn't) be too large so + # the fix is not urgent. + with self.buffer_lock: + + for _ in range(len(self.buffer)): + + temp_length = self._matches(self.buffer[0], + tokens_roi_recver) + if temp_length > 0: + matched_length = temp_length + break + # rotate the element we just accessed to the end + self.buffer.rotate(-1) + + if matched_length > 0: + # need to clone the tensor + # in case the tensor is freed before sending finishes + matched_item = self.buffer.popleft() + for tensor in matched_item: + self._send_tensor_and_dec_size(tensor) + + else: + # no match, just send None + for _ in range(5): + self.data_pipe.send_tensor(None) + + except RuntimeError as e: + if 'Connection closed by peer' not in str(e): + raise e + + logger.debug("Closing drop_select_handler") + + def drop_select( + self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + + assert self.request_handling_thread is None, \ + "drop_select should be called by the KV cache consumer "\ + "(e.g. the decode vLLM instance)" + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone().float() + + self.signal_pipe.send_tensor(self.normal_signal) + self.data_pipe.send_tensor(input_tokens) + self.data_pipe.send_tensor(roi) + + input_tokens = self.data_pipe.recv_tensor() + roi = self.data_pipe.recv_tensor() + if roi is not None: + # convert from float tensor to bool tensor + # as PyNccl does not support sending bool tensor + roi = (roi > 0.5) + key = self.data_pipe.recv_tensor() + value = self.data_pipe.recv_tensor() + hidden = self.data_pipe.recv_tensor() + + return [input_tokens, roi, key, value, hidden] + + def full_handler(self): + time.sleep(0.001) + + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: + + if self.buffer_size > self.buffer_size_threshold: + # log outside the while loop to avoid this message being logged + # repeatedly. + logger.debug("KV transfer buffer is full. 
Handling...") + while self.buffer_size > self.buffer_size_threshold: + self.full_handler() + + self._add_to_buffer(input_tokens, roi, key, value, hidden) + + # when calling the insert, the current process is a sender + # need to launch the request handler and start listening to request. + if self.request_handling_thread is None: + self.request_handling_thread = threading.Thread( + target=self.drop_select_handler) + self.request_handling_thread.start() + + def close(self): + + if hasattr(self, "request_handling_thread" + ) and self.request_handling_thread is not None: + self.request_handling_thread.join() + + else: + # TODO: have a explicit close signal and have a explicit way to + # check if it's requester + self.signal_pipe.send_tensor(self.end_signal) diff --git a/vllm/distributed/kv_transfer/kv_pipe/__init__.py b/vllm/distributed/kv_transfer/kv_pipe/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py new file mode 100644 index 0000000000000..4b0cb44cc5b81 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -0,0 +1,65 @@ +""" +This file defines an interface `KVPipeBase` +that provides an abstraction for sending and receiving tensors, or None, via +distributed communications. + +All classes instantiated from this interface are assumed to be a FIFO pipe. + +If your distributed communication platform already supports key-value lookup, +you can bypass this interface and directly start from `kv_lookup_buffer`. +""" + +from abc import ABC, abstractmethod +from typing import Optional + +import torch + + +class KVPipeBase(ABC): + """ + This class provides an interface for sending and receiving tensors, or + None, by distributed communications. + """ + + @abstractmethod + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """Send a tensor, or None, via the pipe. + + Need to support sending None -- important for error handling. + + TODO: add a `key` argument so that we can use traditional + key-value database as the distributed communication mechanism behind + the pipe. + + Args: + tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def recv_tensor(self) -> Optional[torch.Tensor]: + """Receive a tensor (can be None) from the pipeline. + + Returns: + Optional[torch.Tensor]: The tensor received from the pipeline. Can + be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the pipeline and release resources. + + This method is responsible for closing the communication pipeline + and releasing any resources associated with it. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py new file mode 100644 index 0000000000000..98222fa67e492 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -0,0 +1,276 @@ +""" + This module implements a PyNccl pipe for sending and receiving + Optional[torch.Tensor] between distributed ranks with advanced + communication features. 
+ + Key Features: + - Supports sending and receiving tensors with metadata + - Handles both CUDA and CPU device communications + - Implements a non-blocking tensor transfer mechanism + - Manages buffer size and provides backpressure control + - Supports distributed process groups with configurable parameters +""" + +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Callable, Dict, Optional, Tuple + +import torch + +from vllm.config import KVTransferConfig +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.distributed.utils import StatelessProcessGroup +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class BrokenPipeException(Exception): + + def __init__(self, message): + self.message = message + super().__init__(self.message) + + +Metadata = Dict[str, Optional[torch.Tensor]] + + +class PyNcclPipe(KVPipeBase): + + METADATA_LENGTH = 16 + MAX_TENSOR_DIMENSIONS = 14 + METADATA_DTYPE = torch.int64 + + def __init__(self, + local_rank: int, + config: KVTransferConfig, + device: Optional[str] = None, + port_offset: int = 0): + self.config = config + self.local_rank = local_rank + self.kv_rank = self.config.kv_rank + self.kv_parallel_size = self.config.kv_parallel_size + if device is None: + self.device = self._select_device(self.config.kv_buffer_device) + else: + self.device = self._select_device(device) + + # build distributed connection and send/recv implementation + self.group = StatelessProcessGroup.create( + host=self.config.kv_ip, + port=self.config.kv_port + port_offset, + rank=self.kv_rank, + world_size=self.kv_parallel_size, + ) + # add a barrier to make sure the connection is initiated properly + self.group.barrier() + impl = self._get_device_send_recv_impl(self.group) + self.device_send_func, self.device_recv_func = impl + # set target rank + self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size + self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size + + # transportation-related variables + self.transport_thread: Optional[ThreadPoolExecutor] = None + self.buffer_size = 0 + self.buffer_size_lock = threading.Lock() + self.buffer_size_thresh = self.config.kv_buffer_size + + def _get_device_send_recv_impl( + self, group: StatelessProcessGroup + ) -> Tuple[Callable[[torch.Tensor, int], None], Callable[ + [torch.Tensor, int], None]]: + + send: Callable[[torch.Tensor, int], None] + recv: Callable[[torch.Tensor, int], None] + if self.device.type == "cuda": + # use PyNCCL for send / recv + comm = PyNcclCommunicator(group, device=self.local_rank) + comm.disabled = False + send, recv = comm.send, comm.recv # type: ignore + else: + # This send / recv implementation here is NOT intended to transfer + # KV caches (and should NOT be repurposed to transfer KV caches). + # Currently it is only used to transmit control-plane messages + # for PyNcclBuffer. + send = group.send_obj + + def my_recv(x, src): + x[...] = group.recv_obj(src) + + recv = my_recv + + return send, recv + + def _select_device(self, device: str): + logger.info("Selecting device: %s", device) + if device == "cuda": + return torch.device(f"cuda:{self.local_rank}") + else: + return torch.device("cpu") + + def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata: + """ + Create the metadata as a dictionary based on the input tensor. + + Parameters: + - tensor: The input tensor or None if no tensor is provided. 
+ + Returns: + - metadata: A dictionary with the following keys: + - "dtype": The data type of the tensor or None. + - "shape": The shape of the tensor or None. + """ + if tensor is None: + return {"dtype": None, "shape": None} + else: + return {"dtype": tensor.dtype, "shape": tensor.shape} + + def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor: + """ + Create a buffer to receive the tensor based on the provided metadata. + + Parameters: + - metadata: A dictionary with keys "dtype" and "shape", describing + the tensor's data type and shape. + + Returns: + - buffer: A tensor of the specified type and shape, allocated on + self.device. + """ + return torch.empty(metadata["shape"], + dtype=metadata["dtype"], + device=self.device) + + def _send_metadata(self, metadata: Metadata): + """ + Send the metadata dictionary to the target rank. + + Parameters: + - metadata: A dictionary with keys "dtype" and "shape". + """ + self.group.send_obj(metadata, self.target_rank_for_send) + + def _recv_metadata(self) -> Metadata: + """ + Receive the metadata dictionary from the target rank. + + Returns: + - metadata: A dictionary with keys "dtype" and "shape" describing + the tensor. + """ + return self.group.recv_obj(self.target_rank_for_recv) + + def _send_impl(self, tensor: Optional[torch.Tensor]) -> None: + """ + The actual implementation of sending the tensor and its metadata to the + target rank. + + Parameters: + - tensor: The input tensor to be sent, or None if no tensor is + being sent. + """ + metadata = self._make_metadata(tensor) + self._send_metadata(metadata) + if tensor is not None: + self.device_send_func(tensor.to(self.device), + self.target_rank_for_send) + + def _recv_impl(self) -> Optional[torch.Tensor]: + """ + The actual implementation of receiving a tensor and its metadata from + the target rank. + + Returns: + - buffer: The received tensor, or None if no tensor is received. + """ + metadata = self._recv_metadata() + if metadata["dtype"] is None: + return None + buffer = self._prepare_recv_buffer(metadata) + self.device_recv_func(buffer, self.target_rank_for_recv) + + return buffer + + def send_tensor_wrapper(self, tensor: Optional[torch.Tensor], + tensor_size: int) -> None: + """ + Wrapper for _send_impl to handle exceptions and update buffer size. + """ + try: + self._send_impl(tensor) + + with self.buffer_size_lock: + self.buffer_size -= tensor_size + except Exception as e: + logger.error("[rank%d]: Exception when trying to send %s, msg: %s", + torch.distributed.get_rank(), str(tensor), str(e)) + import traceback + traceback.print_exc() + + def block_if_full(self): + """ + Block the current thread if the buffer size is larger than the + threshold. + """ + while self.buffer_size > self.buffer_size_thresh: + logger.debug("KV cache transfer pipe is full. Waiting...") + time.sleep(0.05) + + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """ + Sends a tensor and its metadata to the destination rank in a + non-blocking way. + + Parameters: + - tensor: The tensor to send, or None if no tensor is being sent. 
+ """ + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + if tensor is not None: + tensor_size = tensor.element_size() * tensor.numel() + else: + tensor_size = 0 + + self.block_if_full() + + with self.buffer_size_lock: + self.buffer_size += tensor_size + + self.transport_thread.submit(self.send_tensor_wrapper, tensor, + tensor_size) + + def recv_tensor(self) -> Optional[torch.Tensor]: + """ + Receives a tensor and its metadata from the source rank. Blocking call. + + Returns: + - tensor: The received tensor, or None if no tensor is received. + """ + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + future = self.transport_thread.submit(self._recv_impl) + + try: + tensor = future.result() + except Exception as e: + logger.error("Encountering exception in KV receiving thread") + logger.error("%s", e) + logger.error("My device: %s", self.device) + import traceback + traceback.print_exc() + raise e + + return tensor + + def close(self): + """ + Close the pipe and release associated resources. + """ + if hasattr(self, + "transport_thread") and self.transport_thread is not None: + self.transport_thread.shutdown() diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py new file mode 100644 index 0000000000000..9ce97851dc849 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py @@ -0,0 +1,75 @@ +"""A centralized entrypoint to perform distributed KV cache transfer. + +This implementation is a shim wrapper on two APIs exposed by `kv_connector`: +1. `send_kv_caches_and_hidden_states` +2. `recv_kv_caches_and_hidden_states +""" +from typing import TYPE_CHECKING, List, Tuple, Union + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + from vllm.config import VllmConfig + +import torch + +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +logger = init_logger(__name__) + + +class KVTransferAgent: + """ + A class designated for distributed KV transfer + + Target use cases: + 1. Disaggregated prefill + 2. Remote KV cache storage + """ + + def __init__( + self, + rank: int, + local_rank: int, + config: "VllmConfig", + ): + + self.config = config + + if config.kv_transfer_config is None: + raise ValueError("KVTransferConfig is not set in the VllmConfig," + " cannot initialize KVConnector.") + + assert self.config.kv_transfer_config.is_kv_transfer_instance, "KV"\ + "TransferAgent should only be used when kv_connector is set." 
+ + self.connector = KVConnectorFactory.create_connector( + rank, local_rank, config) + + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + + self.connector.send_kv_caches_and_hidden_states( + model_executable, model_input, kv_caches, + hidden_or_intermediate_states) + + def close(self) -> None: + self.connector.close() + + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + + return self.connector.recv_kv_caches_and_hidden_states( + model_executable, model_input, kv_caches) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ccbe00386c5da..34815d7f0aa78 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -27,18 +27,23 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Union) from unittest.mock import patch import torch import torch.distributed from torch.distributed import Backend, ProcessGroup +import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer import vllm.envs as envs from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op, supports_custom_op +if TYPE_CHECKING: + from vllm.config import VllmConfig + @dataclass class GraphCaptureContext: @@ -904,6 +909,14 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group +_KV_TRANSFER: Optional[kv_transfer.KVTransferAgent] = None + + +def get_kv_transfer_group() -> kv_transfer.KVTransferAgent: + assert _KV_TRANSFER is not None, ( + "disaggregated KV cache transfer parallel group is not initialized") + return _KV_TRANSFER + @contextmanager def graph_capture(): @@ -1052,6 +1065,26 @@ def initialize_model_parallel( group_name="pp") +def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: + """ + Initialize KV cache transfer parallel group. 
+ """ + + global _KV_TRANSFER + + if vllm_config.kv_transfer_config is None: + return + + if all([ + vllm_config.kv_transfer_config.need_kv_parallel_group, + _KV_TRANSFER is None + ]): + _KV_TRANSFER = kv_transfer.KVTransferAgent( + rank=get_world_group().rank, + local_rank=get_world_group().local_rank, + config=vllm_config) + + def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f0020562c3c3a..4aa0eebd976c9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,10 +9,10 @@ import vllm.envs as envs from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, - DecodingConfig, DeviceConfig, HfOverrides, LoadConfig, - LoadFormat, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PromptAdapterConfig, SchedulerConfig, + DecodingConfig, DeviceConfig, HfOverrides, + KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PoolerConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, TaskOption, TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase @@ -108,6 +108,7 @@ class EngineArgs: # notice. distributed_executor_backend: Optional[Union[str, Type[ExecutorBase]]] = None + # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None @@ -194,6 +195,8 @@ class EngineArgs: compilation_config: Optional[CompilationConfig] = None worker_cls: str = "auto" + kv_transfer_config: Optional[KVTransferConfig] = None + def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model @@ -908,6 +911,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'compilers, using -O without space is also ' 'supported. -O3 is equivalent to -O 3.') + parser.add_argument('--kv-transfer-config', + type=KVTransferConfig.from_cli, + default=None, + help='The configurations for distributed KV cache ' + 'transfer. 
Should be a JSON string.') + parser.add_argument( '--worker-cls', type=str, @@ -1201,6 +1210,7 @@ def create_engine_config(self, observability_config=observability_config, prompt_adapter_config=prompt_adapter_config, compilation_config=self.compilation_config, + kv_transfer_config=self.kv_transfer_config, ) if envs.VLLM_USE_V1: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1f654a9cce465..c9f06eef3f907 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_pp_group +from vllm.distributed import get_kv_transfer_group, get_pp_group from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry @@ -1666,6 +1666,24 @@ def execute_model( else: model_executable = self.model + # Receive KV cache in distributed KV cache transfer setting + # In disagg prefill setting, it will also recv hidden states and bypass + # model forwarding + # In KV cache database setting, it will change the model input so that + # we can skip prefilling on tokens that successfully received KV caches + # NOTE: The receive operation is blocking + bypass_model_exec = False + if self.need_recv_kv(model_input, kv_caches): + hidden_or_intermediate_states, bypass_model_exec, model_input = \ + get_kv_transfer_group().recv_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can receive KV for only those + # layers. + model_executable, + model_input, + kv_caches=kv_caches + ) + multi_modal_kwargs = model_input.multi_modal_kwargs or {} seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, @@ -1677,21 +1695,36 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - with set_forward_context(model_input.attn_metadata, self.vllm_config): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + if not bypass_model_exec: + with set_forward_context(model_input.attn_metadata, + self.vllm_config): + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() + # Sending KV cache in distributed KV cache transfer setting + # NOTE: the send operation is non-blocking + if self.need_send_kv(model_input, kv_caches): + get_kv_transfer_group().send_kv_caches_and_hidden_states( + # model_executable is used to know which layer the current + # worker is working on, so that we can send KV for only those + # layers. + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) + # Compute the logits in the last pipeline stage. 
if not get_pp_group().is_last_rank: if (self.is_driver_worker @@ -1759,6 +1792,56 @@ def execute_model( return [output] + def need_recv_kv(self, model_input, kv_caches) -> bool: + """Check if we need to receive kv-cache from the other worker. + We need to receive KV when + 1. current vLLM instance is KV cache consumer/decode vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + if self.vllm_config.kv_transfer_config is None: + return False + + return self.vllm_config.kv_transfer_config.is_kv_consumer and ( + not is_profile_run) and is_prefill_run + + def need_send_kv(self, model_input, kv_caches) -> bool: + """Check if we need to send kv-cache to the other worker. + We need to send KV when + 1. current vLLM instance is KV cache producer/prefill vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + if self.vllm_config.kv_transfer_config is None: + return False + + return self.vllm_config.kv_transfer_config.is_kv_producer and ( + not is_profile_run) and is_prefill_run + # NOTE: this is nn.Module so the profiler can properly capture/group # kernels calls made within the graph diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d58cb029618e9..094dd5a5d08b3 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -8,8 +8,9 @@ import torch.distributed import vllm.envs as envs -from vllm.config import ParallelConfig, VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, +from vllm.config import VllmConfig +from vllm.distributed import (ensure_kv_transfer_initialized, + ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.logger import init_logger @@ -144,7 +145,7 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - init_worker_distributed_environment(self.parallel_config, self.rank, + init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. @@ -457,20 +458,22 @@ def get_cache_block_size_bytes(self) -> int: def init_worker_distributed_environment( - parallel_config: ParallelConfig, + vllm_config: VllmConfig, rank: int, distributed_init_method: Optional[str] = None, local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + parallel_config = vllm_config.parallel_config set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + ensure_kv_transfer_initialized(vllm_config) + def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): # Check if the GPU supports the dtype. 
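
Taken together, the KV-transfer pieces in this patch are wired as follows: the worker calls ensure_kv_transfer_initialized(vllm_config), which builds a KVTransferAgent around a connector from KVConnectorFactory, and that connector is in turn expected to drive the PyNcclPipe / SimpleBuffer classes introduced above. The sketch below is illustrative only and is not part of the patch: it assumes `config` is a KVTransferConfig (e.g. parsed from --kv-transfer-config) with kv_parallel_size=2, the helper names (build_buffer, prefill_side, decode_side) are invented for the example, and the port_offset split between the two pipes is an assumption rather than something this patch prescribes.

from typing import List, Optional

import torch

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import (
    SimpleBuffer)
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe


def build_buffer(local_rank: int, config: KVTransferConfig) -> SimpleBuffer:
    # Illustrative sketch only -- not part of this patch.
    # CPU pipe for control-plane signals, device pipe for the KV tensors,
    # mirroring the signal_pipe / data_pipe split that SimpleBuffer expects.
    signal_pipe = PyNcclPipe(local_rank, config, device="cpu", port_offset=0)
    data_pipe = PyNcclPipe(local_rank, config, port_offset=1)
    return SimpleBuffer(signal_pipe, data_pipe, config.kv_buffer_size)


def prefill_side(buffer: SimpleBuffer, tokens: torch.Tensor,
                 roi: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                 hidden: torch.Tensor) -> None:
    # insert() is non-blocking apart from backpressure, and it starts the
    # drop_select_handler thread on the first call.
    buffer.insert(tokens, roi, key, value, hidden)


def decode_side(buffer: SimpleBuffer, tokens: torch.Tensor,
                roi: torch.Tensor) -> List[Optional[torch.Tensor]]:
    # drop_select() blocks and returns [tokens, roi, key, value, hidden];
    # the entries come back as None when nothing in the buffer matches.
    return buffer.drop_select(tokens, roi)
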
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7aaa8b453cff1..7c0bc5a678956 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -43,6 +43,7 @@ def __init__( self.speculative_config = vllm_config.speculative_config self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config + self.kv_transfer_config = vllm_config.kv_transfer_config @abstractmethod def init_device(self) -> None: From 02eb17991ca68ba97cf11cd0417fc5f2b5d0b6f1 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 2 Dec 2024 09:31:09 +0800 Subject: [PATCH 108/293] [Model] Add BNB support to Llava and Pixtral-HF (#10795) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- vllm/model_executor/models/llava.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 7fd4b32774798..db7fa82ceb9b7 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -287,6 +287,15 @@ def init_vision_tower_for_llava( @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava) @INPUT_REGISTRY.register_input_processor(input_processor_for_llava) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() From 8d5035d59d07d148f56a5f7df273c7565e20d128 Mon Sep 17 00:00:00 2001 From: cduk <19917266+cduk@users.noreply.github.com> Date: Mon, 2 Dec 2024 02:49:48 +0100 Subject: [PATCH 109/293] =?UTF-8?q?[core]=20Avoid=20metrics=20log=20noise?= =?UTF-8?q?=20when=20idle=20-=20include=20speculative=20decodi=E2=80=A6=20?= =?UTF-8?q?(#10809)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Andrew Feldman --- vllm/engine/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 5bfd6a9f4b386..4869557ba9b44 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -473,13 +473,13 @@ def log(self, stats: Stats) -> None: ) if (stats.cpu_prefix_cache_hit_rate >= 0 or stats.gpu_prefix_cache_hit_rate >= 0): - logger.info( + log_fn( "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%", stats.gpu_prefix_cache_hit_rate * 100, stats.cpu_prefix_cache_hit_rate * 100, ) if self.spec_decode_metrics is not None: - logger.info( + log_fn( self._format_spec_decode_metrics_str( self.spec_decode_metrics)) From ab21a28faa40a6f10c31b8ac46caa5601a15548d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 1 Dec 2024 17:55:39 -0800 Subject: [PATCH 110/293] [Kernel] Use `out` arg in flash_attn_varlen_func (#10811) Signed-off-by: Woosuk Kwon Signed-off-by: Andrew Feldman --- CMakeLists.txt | 2 +- tests/kernels/test_flash_attn.py | 20 +++++++++++++++++--- vllm/v1/attention/backends/flash_attn.py | 6 +++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f43bf8143458b..c78cdc77a7e42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 
fdf6d72b48aea41f4ae6a89139a453dae554abc8 + GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index a20c73345218f..1ae78d7b46c5b 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -71,6 +71,7 @@ def ref_paged_attn( return torch.cat(outputs, dim=0) +@pytest.mark.parametrize("use_out", [True, False]) @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -81,6 +82,7 @@ def ref_paged_attn( @pytest.mark.parametrize("sliding_window", [None, 256]) @torch.inference_mode() def test_flash_attn_with_paged_kv( + use_out: bool, kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, @@ -116,17 +118,22 @@ def test_flash_attn_with_paged_kv( (num_seqs, max_num_blocks_per_seq), dtype=torch.int32) + q = query.unsqueeze(1) + out = torch.empty_like(q) if use_out else None output = flash_attn_with_kvcache( - q=query.unsqueeze(1), + q=q, k_cache=key_cache, v_cache=value_cache, + out=out, softmax_scale=scale, causal=True, block_table=block_tables, cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, window_size=window_size, - ).squeeze(1) + ) + output = output if not use_out else out + output = output.squeeze(1) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -141,7 +148,10 @@ def test_flash_attn_with_paged_kv( f"{torch.max(torch.abs(output - ref_output))}" -@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]]) +@pytest.mark.parametrize("use_out", [True, False]) +@pytest.mark.parametrize("seq_lens", + [[(1, 1328), (5, 18), + (129, 463)], [(1, 523), (1, 37), (1, 2011)]]) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @@ -151,6 +161,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @torch.inference_mode() def test_varlen_with_paged_kv( + use_out: bool, seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int], head_size: int, @@ -197,10 +208,12 @@ def test_varlen_with_paged_kv( (num_seqs, max_num_blocks_per_seq), dtype=torch.int32) + out = torch.empty_like(query) if use_out else None output = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, + out=out, cu_seqlens_q=cu_query_lens, cu_seqlens_k=cu_kv_lens, max_seqlen_q=max_query_len, @@ -211,6 +224,7 @@ def test_varlen_with_paged_kv( block_table=block_tables, softcap=soft_cap if soft_cap is not None else 0, ) + output = output if not use_out else out ref_output = ref_paged_attn( query=query, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e618edf7d35bf..4aa4b296f0efc 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -205,10 +205,12 @@ def unified_v1_flash_attention( v_scale, ) - attn_output = flash_attn_varlen_func( + # Compute attention and update output up to `num_actual_tokens`. 
+ flash_attn_varlen_func( q=query[:num_actual_tokens], k=key_cache, v=value_cache, + out=output[:num_actual_tokens], cu_seqlens_q=attn_metadata.query_start_loc, max_seqlen_q=attn_metadata.max_query_len, cu_seqlens_k=attn_metadata.seq_start_loc, @@ -220,8 +222,6 @@ def unified_v1_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - # TODO(woosuk): Remove this unnecessary copy. - output[:num_actual_tokens].copy_(attn_output) def unified_v1_flash_attention_fake( From 6643bf204c73fcd8fb6f4de3085cfcb45d1d905d Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sun, 1 Dec 2024 23:05:32 -0300 Subject: [PATCH 111/293] Fill TorchSDPAAttentionMetadata seq_lens_field for prefill (#10799) Signed-off-by: Max de Bayser Signed-off-by: Andrew Feldman --- vllm/attention/backends/torch_sdpa.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 16e044b618c40..dafa5bb56acda 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -341,7 +341,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], ) else: block_tables = torch.tensor([]) - seq_lens_tensor = torch.tensor([]) + seq_lens_tensor = torch.tensor( + input_data.seq_lens[:input_data.num_prefills], + dtype=torch.int32, + device="cpu", + ) # For multi-modal models placeholder_index_maps = None From 946493163b661943b0a7e07fdcbc4a18b3a86a7a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 1 Dec 2024 19:27:13 -0800 Subject: [PATCH 112/293] [misc] remove xverse modeling file (#10814) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- vllm/model_executor/models/registry.py | 2 +- vllm/model_executor/models/xverse.py | 423 ------------------------- 2 files changed, 1 insertion(+), 424 deletions(-) delete mode 100644 vllm/model_executor/models/xverse.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2b7b69e8c3a95..c66fbce018a62 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,7 +94,7 @@ "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), - "XverseForCausalLM": ("xverse", "XverseForCausalLM"), + "XverseForCausalLM": ("llama", "LlamaForCausalLM"), # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py deleted file mode 100644 index 25a0d474e2863..0000000000000 --- a/vllm/model_executor/models/xverse.py +++ /dev/null @@ -1,423 +0,0 @@ -# Adapted from -# https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Xverse model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm.attention import Attention, AttentionMetadata -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsLoRA, SupportsPP -from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) - - -class XverseMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate, _ = self.gate_up_proj(x) - x = self.act_fn(gate) - x, _ = self.down_proj(x) - return x - - -class XverseAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - quant_config: Optional[QuantizationConfig] = None, - bias: bool = False, - cache_config: Optional[CacheConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - # partition the KV heads across multiple tensor parallel GPUs. 
- assert self.total_num_kv_heads % tp_size == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - quant_config=quant_config, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - quant_config=quant_config, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class XverseDecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = XverseAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - quant_config=quant_config, - bias=getattr(config, "bias", False), - cache_config=cache_config, - prefix=f"{prefix}.self_attn", - ) - self.mlp = XverseMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -@support_torch_compile -class XverseModel(nn.Module): - - 
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: XverseDecoderLayer( - config, cache_config, quant_config, prefix=prefix), - prefix=f"{prefix}.layers", - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, - residual, - ) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", - } - embedding_padding_modules = ["lm_head"] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = XverseModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> 
torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if ("rotary_emb.inv_freq" in name - or "rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params From 777bb76a8097bb46b87837148595e3b621725c01 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Mon, 2 Dec 2024 12:14:45 +0800 Subject: [PATCH 113/293] [doc]Update config docstring (#10732) Signed-off-by: wangxiyuan Signed-off-by: Andrew Feldman --- vllm/config.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 5d9e2766c7faa..510bd81d66217 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -91,6 +91,8 @@ class ModelConfig: the default version. max_model_len: Maximum length of a sequence (including prompt and output). If None, will be derived from the model. + spec_target_max_model_len: Specify the the maximum length for spec + decoding draft models. quantization: Quantization method that was used to quantize the model weights. If None, we assume the model weights are not quantized. quantization_param_path: Path to JSON file containing scaling factors. @@ -107,6 +109,7 @@ class ModelConfig: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. + max_logprobs: Maximum number of log probabilities. Defaults to 20. disable_sliding_window: Whether to disable sliding window. 
If True, we will disable the sliding window functionality of the model. If the model does not support sliding window, this argument is @@ -119,6 +122,8 @@ class ModelConfig: the model name will be the same as `model`. limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. + use_async_output_proc: Whether to use async output processor. + Defaults to True. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. hf_overrides: If a dictionary, contains arguments to be forwarded to the @@ -130,7 +135,7 @@ class ModelConfig: override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that can not be gathered from the vllm arguments. - override_pooling_config: Initialize non default pooling config or + override_pooler_config: Initialize non default pooling config or override default pooling config for the embedding model. """ @@ -734,8 +739,13 @@ class CacheConfig: vLLM execution. swap_space: Size of the CPU swap space per GPU (in GiB). cache_dtype: Data type for kv cache storage. + is_attention_free: Whether the model is attention-free. num_gpu_blocks_override: Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified. Does nothing if None. + sliding_window: Sliding window size for the KV cache. Can not work with + prefix caching enabled. + enable_prefix_caching: Whether to enable prefix caching. + cpu_offload_gb: Size of the CPU offload buffer in GiB. """ def __init__( @@ -904,6 +914,7 @@ class LoadConfig: "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. "bitsandbytes" will load nf4 type weights. + model_loader_extra_config: The extra config for the model loader. ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. From 221ee7939f5c3d161b0fe8cf4da775e82f083f84 Mon Sep 17 00:00:00 2001 From: zhou fan <1247714429@qq.com> Date: Mon, 2 Dec 2024 13:36:36 +0800 Subject: [PATCH 114/293] [Model]: add some tests for aria model (#10770) Signed-off-by: xffxff <1247714429@qq.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Signed-off-by: Andrew Feldman --- tests/conftest.py | 6 +++- .../vision_language/test_models.py | 30 +++++++++++++++++++ .../vision_language/vlm_utils/core.py | 11 +++++-- .../vision_language/vlm_utils/types.py | 7 +++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 36f1d477fab59..d6be8f5b00af8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -656,6 +656,7 @@ def __init__( model_name: str, task: TaskOption = "auto", tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. 
max_model_len: int = 1024, @@ -672,6 +673,7 @@ def __init__( model=model_name, task=task, tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, trust_remote_code=True, dtype=dtype, swap_space=swap_space, @@ -842,6 +844,7 @@ def generate_greedy_logprobs( audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -849,7 +852,8 @@ def generate_greedy_logprobs( max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=num_prompt_logprobs, - stop_token_ids=stop_token_ids) + stop_token_ids=stop_token_ids, + stop=stop) return self.generate_w_logprobs(prompts, greedy_logprobs_params, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3457ec6b8e73b..dbb0b4d350d10 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -8,6 +8,7 @@ import pytest import transformers from transformers import AutoModelForVision2Seq +from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform from vllm.utils import cuda_device_count_stateless, identity @@ -134,6 +135,35 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), #### Extended model tests + "aria": VLMTestInfo( + models=["rhymes-ai/Aria"], + tokenizer_mode="slow", + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + ), + dtype="bfloat16", + prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|img|>\n", + max_model_len=4096, + max_num_seqs=2, + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "Please describe the image shortly.", + "cherry_blossom": "Please infer the season with reason.", + }), + multi_image_prompt="Describe the two images shortly.", # noqa: E501 + postprocess_inputs=model_utils.get_key_type_post_processor("pixel_values"), + stop_str=["<|im_end|>"], + image_size_factors=[(0.10, 0.15)], + max_tokens=64, + marks=[ + pytest.mark.skipif( + not is_flash_attn_2_available(), + reason="Model needs flash-attn for numeric convergence.", + ), + large_gpu_mark(min_gb=64), + ], + ), "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], test_type=VLMTestType.IMAGE, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 7e8c6dabb15af..88349ef9a3a69 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -29,6 +29,8 @@ def run_test( postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]], + stop_str: Optional[List[str]], + tokenizer_mode: str, limit_mm_per_prompt: Dict[str, int], model_kwargs: Optional[Dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], @@ -50,11 +52,14 @@ def run_test( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- vllm_kwargs = {} + vllm_kwargs: Dict[str, Any] = {} if get_stop_token_ids is not None: vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) + if stop_str: + vllm_kwargs["stop"] = stop_str with vllm_runner(model, + tokenizer_mode=tokenizer_mode, max_model_len=max_model_len, max_num_seqs=max_num_seqs, dtype=dtype, @@ -85,6 +90,8 @@ def run_test( hf_kwargs = {} if use_tokenizer_eos: hf_kwargs["eos_token_id"] = tokenizer.eos_token_id + if stop_str: + hf_kwargs["stop_strings"] = stop_str with hf_model, torch.no_grad(): for prompts, media in inputs: @@ -138,4 +145,4 @@ def process_runner_outputs( def process_outputs(output_processor, model, outputs_per_image): """Applies a model specific post-processor function to a runner's output""" return [[output_processor(res, model) for res in outputs] - for outputs in outputs_per_image] + for outputs in outputs_per_image] \ No newline at end of file diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index 8459476dc2d07..d410fa8c653ce 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -97,6 +97,9 @@ class VLMTestInfo(NamedTuple): # Optional callable which gets a list of token IDs from the model tokenizer get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None + # Optional list of strings to stop generation, useful when stop tokens are + # not special tokens in the tokenizer + stop_str: Optional[List[str]] = None # Exposed options for HF runner model_kwargs: Optional[Dict[str, Any]] = None @@ -148,6 +151,8 @@ class VLMTestInfo(NamedTuple): marks: Optional[List[MarkDecorator]] = None + tokenizer_mode: str = "auto" + def get_non_parametrized_runner_kwargs(self): """Returns a dictionary of expandable kwargs for items that are used in all test types, which are NOT used when creating the parametrized @@ -166,8 +171,10 @@ def get_non_parametrized_runner_kwargs(self): "postprocess_inputs": self.postprocess_inputs, "comparator": self.comparator, "get_stop_token_ids": self.get_stop_token_ids, + "stop_str": self.stop_str, "model_kwargs": self.model_kwargs, "patch_hf_runner": self.patch_hf_runner, + "tokenizer_mode": self.tokenizer_mode } From 39cd324f92744ce27ffc940eb08f8d7831e08582 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Mon, 2 Dec 2024 04:38:52 -0500 Subject: [PATCH 115/293] Update vllm/outputs.py Co-authored-by: Woosuk Kwon Signed-off-by: Andrew Feldman --- vllm/outputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index ead37164f1113..08bc5a91174a9 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -141,7 +141,7 @@ def new( token_ids: completion token ids logprobs: completion sample logprobs prompt_logprobs: prompt logprobs - finished + finished: whether the request is finished """ # TODO: Support `n` > 1. 
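
Since the docstring fix above spells out how sample logprobs and prompt logprobs are attached to a RequestOutput, a minimal usage sketch may be a helpful reference point; it is not part of this patch series, and the model name and logprob counts are arbitrary examples.

from vllm import LLM, SamplingParams

# Illustrative only -- not part of this patch series.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=8, logprobs=5, prompt_logprobs=3)

(request_output, ) = llm.generate(["Hello, my name is"], params)

# One dict of Logprob entries per generated token; the sampled token is
# always included, even when it falls outside the top-5 requested here.
sample_logprobs = request_output.outputs[0].logprobs

# One entry per prompt position; the first position carries no logprob and
# is reported as None.
prompt_logprobs = request_output.prompt_logprobs
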
From 5757476ce7496083f5ba5b1d8736acd32803ad97 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 10:12:01 +0000 Subject: [PATCH 116/293] small fixes Signed-off-by: Andrew Feldman --- vllm/v1/engine/processor.py | 26 +++++++++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 3 +-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5bcf1b5e7b86e..8fe9d3adb8792 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -39,7 +39,7 @@ def __init__( self.input_processor = input_registry.create_input_processor( model_config) - def _assert_valid_logprobs_prompt_logprobs( + def _assert_valid_sample_logprobs_prompt_logprobs( self, params: Union[SamplingParams, PoolingParams], max_logprobs: int, @@ -70,17 +70,37 @@ def process_inputs( prompt: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: float, - max_logprobs: int, + max_logprobs_permitted_by_engine: int, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + """Process the input prompt into an engine request + + Args: + request_id: request ID + prompt: input prompt str + params: sampling or pooling commands + arrival_time: time when inputs arrived; will be computed if `None` + is passed in + max_logprobs_permitted_by_engine: the max number of sample or prompt + logprobs a request may ask for + lora_request: LoRA request structure + trace_headers: trace info + prompt_adapter_request: prompt adapter request structure + priority: currently unsupported; must be zero & is by default. + + Returns: + Detokenizer request structure + Engine request structure + """ # TODO(woosuk): Support embedding mode. # TODO(woosuk): Support encoder-decoder models. 
- self._assert_valid_logprobs_prompt_logprobs(params, max_logprobs) + self._assert_valid_sample_logprobs_prompt_logprobs( + params, max_logprobs_permitted_by_engine) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dbfb6ef3aaa4..6004d160c5c09 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -211,8 +211,7 @@ def _prepare_inputs( self, scheduler_output: "SchedulerOutput", sampling_metadata: SamplingMetadata, - ) -> Tuple[torch.Tensor, FlashAttentionMetadata, torch.Tensor, - torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + ) -> Tuple[torch.Tensor, FlashAttentionMetadata]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 From 3d1373cdb6ba85e37c9bc7bcbec1122bea77b288 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 10:34:19 +0000 Subject: [PATCH 117/293] moved output processing commands into processor Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 250 ----------------------------------- vllm/v1/engine/core.py | 255 +++++++++++++++++++++++++++++++++++- vllm/v1/engine/processor.py | 2 +- 3 files changed, 253 insertions(+), 254 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b515d15172c44..899bdcbb156bb 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -6,11 +6,8 @@ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.engine import EngineCoreOutput -from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus if TYPE_CHECKING: @@ -390,253 +387,6 @@ def _try_schedule_encoder_inputs( encoder_inputs_to_schedule.append(i) return encoder_inputs_to_schedule, num_new_tokens, encoder_budget - def _pythonize_logprobs( - self, - do_logprobs: bool, - do_prompt_logprobs: bool, - model_runner_output: "ModelRunnerOutput", - ) -> Tuple[List, List, List, List]: - """Convert logprobs tensors to Python data structures. 
- - Args: - do_logprobs: sample logprobs are required - do_prompt_logprobs: prompt logprobs are required - model_runner_output: model runner output contains CPU logprobs tensors - - Returns: - logprob_token_ids_list - logprob_values_list - prompt_logprob_token_ids_list - prompt_logprob_values_list - """ - if do_logprobs: - # Pythonize sample logprobs if needed - assert model_runner_output.logprob_token_ids_cpu is not None - logprob_token_ids_list = ( - model_runner_output.logprob_token_ids_cpu.tolist()) - logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) - else: - ( - logprob_token_ids_list, - logprob_values_list, - ) = (None, None) - if do_prompt_logprobs: - # Pythonize prompt logprobs if needed - assert model_runner_output.prompt_logprob_token_ids_cpu is not None - prompt_logprob_token_ids_list = ( - model_runner_output.prompt_logprob_token_ids_cpu.tolist()) - prompt_logprob_values_list = ( - model_runner_output.prompt_logprobs_cpu.tolist()) - else: - ( - prompt_logprob_token_ids_list, - prompt_logprob_values_list, - ) = (None, None) - - return (logprob_token_ids_list, logprob_values_list, - prompt_logprob_token_ids_list, prompt_logprob_values_list) - - def update_from_output( - self, - scheduler_output: "SchedulerOutput", - model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: - # NOTE(woosuk): This method doesn't consider speculative decoding. - sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() - num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_logprobs = model_runner_output.logprobs_cpu is not None - do_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu is not None - and len(model_runner_output.prompt_logprobs_cpu) > 0) - - # Get logprobs as Python data structures - ( - logprob_token_ids_list, - logprob_values_list, - prompt_logprob_token_ids_list, - prompt_logprob_values_list, - ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, - model_runner_output) - - if do_prompt_logprobs: - # Index into prompt tokens, for building - # prompt logprobs output data structure - curr_prompt_base_idx = 0 - new_running: List[Request] = [] - engine_core_outputs: List[EngineCoreOutput] = [] - for request in self.running: - req_id = request.request_id - request.num_computed_tokens += num_scheduled_tokens[req_id] - req_index = model_runner_output.req_id_to_index[req_id] - num_new_tokens = 1 - max_logprobs = request.max_logprobs - request_do_logprobs = (do_logprobs and max_logprobs is not None - and max_logprobs > 0) - - if do_prompt_logprobs: - max_prompt_logprobs = request.max_prompt_logprobs - # Number of new prompt tokens is the number of scheduled - # tokens *if* the request is partial (because the sampled - # token is discarded and all sequence offsets are prompt - # offsets), otherwise it is the number of scheduled - # tokens minus one (for the sampled token) - num_new_prompt_tokens = ( - num_scheduled_tokens[request.request_id] - - int(scheduler_output.partial_req_index != req_index)) - - request_do_prompt_logprobs = (max_prompt_logprobs is not None - and max_prompt_logprobs > 0 - and num_new_prompt_tokens > 0) - - if request_do_prompt_logprobs: - - # Construct prompt logprobs, under the condition that - # prompt logprobs were requested & a nonzero number of - # prompt tokens were computed in this step for this request. - # - # Note that this scenario returns an EngineCoreOutput which - # is empty except for the prompt logprobs which were - # computed for these prompt tokens. 
- - slice_upper_index = (curr_prompt_base_idx + - num_new_prompt_tokens) - prompt_logprob_token_ids = prompt_logprob_token_ids_list[ - curr_prompt_base_idx:slice_upper_index] - prompt_logprob_values = prompt_logprob_values_list[ - curr_prompt_base_idx:slice_upper_index] - curr_prompt_base_idx = slice_upper_index - - logprob_cnt = max_prompt_logprobs - prompt_logprobs = [{ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(plp_tok_values[0:logprob_cnt], - plp_tok_token_ids[0:logprob_cnt])) - } for plp_tok_values, plp_tok_token_ids in zip( - prompt_logprob_values, prompt_logprob_token_ids)] - - if not request.prompt_logprobs: - # Ensure that None is the first prompt logprob - prompt_logprobs = [None] + prompt_logprobs - - curr_prompt_base_idx = slice_upper_index - - prompt_slice_range_upper = request.num_computed_tokens - prompt_slice_range_lower = (prompt_slice_range_upper - - num_new_prompt_tokens) - request.prompt_logprobs.extend(prompt_logprobs) - else: - curr_prompt_base_idx += num_new_prompt_tokens - else: - request_do_prompt_logprobs = False - - # When the request's num_computed_tokens catches up its num_tokens, - # the request generates output tokens. Otherwise, we ignore the - # sampler output for the request. - assert request.num_computed_tokens <= request.num_tokens - - cached_encoder_input_ids = ( - self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) - - if request.num_computed_tokens == request.num_tokens: - # NOTE(woosuk): Currently, we assume that each request - # generates at most one token at each step. - token_id = sampled_token_ids[req_index] - if request_do_logprobs: - # Construct logprobs, if requested (TODO: assumes one - # generated token). - logprob_token_ids = logprob_token_ids_list[req_index] - logprob_values = logprob_values_list[req_index] - logprob_cnt = max_logprobs - if token_id not in logprob_token_ids[0:max_logprobs]: - # Sampled token is not in the in the top logprobs; - # inject it & resort, ensuring that excess logprobs - # not requested by the user have -inf probability - logprob_values[max_logprobs:-1] = ( - [float('-inf')] * - (len(logprob_values) - 1 - max_logprobs)) - - indices = sorted(range(len(logprob_values)), - key=lambda k: logprob_values[k], - reverse=True) - logprob_values = [logprob_values[i] for i in indices] - logprob_token_ids = [ - logprob_token_ids[i] for i in indices - ] - - # There will be one more logprob than the user requested - logprob_cnt = max_logprobs + 1 - - # Only keep the number of logprobs specified by the request - # (plus possibly the sampled token id & its logprob) - logprob_values = logprob_values[0:logprob_cnt] - logprob_token_ids = logprob_token_ids[0:logprob_cnt] - - request.logprobs.append({ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(logprob_values, logprob_token_ids)) - }) - request.append_output_token_ids(token_id) - # TODO: Update the KV cache manager for prefix caching. - - # Check for stop and update request state. - # This must be called before me make the EngineCoreOutput. - stopped = self._check_stop(request) - - # Add EngineCoreOutput for this Request. 
- # Return the logprob for the most recently computed tokens. - # Return no prompt logprobs in decode-phase. - output = EngineCoreOutput( - request_id=req_id, - new_token_ids=request.output_token_ids[-num_new_tokens:], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=(request.logprobs[-num_new_tokens:] - if request_do_logprobs else None), - prompt_logprobs=(prompt_logprobs - if request_do_prompt_logprobs else None), - prompt_logprobs_token_ids=(request.prompt_token_ids - if request_do_prompt_logprobs - else None)) - engine_core_outputs.append(output) - - # Breakout of the loop. - if stopped: - continue - - elif request_do_prompt_logprobs: - # This request is still partial but prompt logprobs were - # requested - engine_core_outputs.append( - EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=[] if request_do_logprobs else None, - prompt_logprobs=( - prompt_logprobs if request_do_prompt_logprobs else - ([] if request_do_prompt_logprobs else None)), - prompt_logprobs_token_ids=( - request.prompt_token_ids[prompt_slice_range_lower: - prompt_slice_range_upper] - if request_do_prompt_logprobs else - ([] if request_do_prompt_logprobs else None)))) - - new_running.append(request) - self.running = new_running - return engine_core_outputs - def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len or request.num_output_tokens >= request.max_tokens): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 34f99dd30ef2e..c6ff0bc59da5f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -14,13 +14,15 @@ from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger +from vllm.sequence import Logprob from vllm.usage.usage_lib import UsageContext -from vllm.v1.core.scheduler import Scheduler +from vllm.v1.core.scheduler import Scheduler, SchedulerOutput from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType) from vllm.v1.engine.mm_input_mapper import MMInputMapper from vllm.v1.executor.gpu_executor import GPUExecutor +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder from vllm.version import __version__ as VLLM_VERSION @@ -103,6 +105,254 @@ def abort_requests(self, request_ids: List[str]): self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) + def _pythonize_logprobs( + self, + do_logprobs: bool, + do_prompt_logprobs: bool, + model_runner_output: "ModelRunnerOutput", + ) -> Tuple[List, List, List, List]: + """Convert logprobs tensors to Python data structures. 
+ + Args: + do_logprobs: sample logprobs are required + do_prompt_logprobs: prompt logprobs are required + model_runner_output: model runner output contains CPU logprobs tensors + + Returns: + logprob_token_ids_list + logprob_values_list + prompt_logprob_token_ids_list + prompt_logprob_values_list + """ + if do_logprobs: + # Pythonize sample logprobs if needed + assert model_runner_output.logprob_token_ids_cpu is not None + logprob_token_ids_list = ( + model_runner_output.logprob_token_ids_cpu.tolist()) + logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) + else: + ( + logprob_token_ids_list, + logprob_values_list, + ) = (None, None) + if do_prompt_logprobs: + # Pythonize prompt logprobs if needed + assert model_runner_output.prompt_logprob_token_ids_cpu is not None + prompt_logprob_token_ids_list = ( + model_runner_output.prompt_logprob_token_ids_cpu.tolist()) + prompt_logprob_values_list = ( + model_runner_output.prompt_logprobs_cpu.tolist()) + else: + ( + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = (None, None) + + return (logprob_token_ids_list, logprob_values_list, + prompt_logprob_token_ids_list, prompt_logprob_values_list) + + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[EngineCoreOutput]: + scheduler = self.scheduler + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_logprobs = model_runner_output.logprobs_cpu is not None + do_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu is not None + and len(model_runner_output.prompt_logprobs_cpu) > 0) + + # Get logprobs as Python data structures + ( + logprob_token_ids_list, + logprob_values_list, + prompt_logprob_token_ids_list, + prompt_logprob_values_list, + ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, + model_runner_output) + + if do_prompt_logprobs: + # Index into prompt tokens, for building + # prompt logprobs output data structure + curr_prompt_base_idx = 0 + new_running: List[Request] = [] + engine_core_outputs: List[EngineCoreOutput] = [] + for request in scheduler.running: + req_id = request.request_id + request.num_computed_tokens += num_scheduled_tokens[req_id] + req_index = model_runner_output.req_id_to_index[req_id] + num_new_tokens = 1 + max_logprobs = request.max_logprobs + request_do_logprobs = (do_logprobs and max_logprobs is not None + and max_logprobs > 0) + + if do_prompt_logprobs: + max_prompt_logprobs = request.max_prompt_logprobs + # Number of new prompt tokens is the number of scheduled + # tokens *if* the request is partial (because the sampled + # token is discarded and all sequence offsets are prompt + # offsets), otherwise it is the number of scheduled + # tokens minus one (for the sampled token) + num_new_prompt_tokens = ( + num_scheduled_tokens[request.request_id] - + int(scheduler_output.partial_req_index != req_index)) + + request_do_prompt_logprobs = (max_prompt_logprobs is not None + and max_prompt_logprobs > 0 + and num_new_prompt_tokens > 0) + + if request_do_prompt_logprobs: + + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. 
+ # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. + + slice_upper_index = (curr_prompt_base_idx + + num_new_prompt_tokens) + prompt_logprob_token_ids = prompt_logprob_token_ids_list[ + curr_prompt_base_idx:slice_upper_index] + prompt_logprob_values = prompt_logprob_values_list[ + curr_prompt_base_idx:slice_upper_index] + curr_prompt_base_idx = slice_upper_index + + logprob_cnt = max_prompt_logprobs + prompt_logprobs = [{ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(plp_tok_values[0:logprob_cnt], + plp_tok_token_ids[0:logprob_cnt])) + } for plp_tok_values, plp_tok_token_ids in zip( + prompt_logprob_values, prompt_logprob_token_ids)] + + if not request.prompt_logprobs: + # Ensure that None is the first prompt logprob + prompt_logprobs = [None] + prompt_logprobs + + curr_prompt_base_idx = slice_upper_index + + prompt_slice_range_upper = request.num_computed_tokens + prompt_slice_range_lower = (prompt_slice_range_upper - + num_new_prompt_tokens) + request.prompt_logprobs.extend(prompt_logprobs) + else: + curr_prompt_base_idx += num_new_prompt_tokens + else: + request_do_prompt_logprobs = False + + # When the request's num_computed_tokens catches up its num_tokens, + # the request generates output tokens. Otherwise, we ignore the + # sampler output for the request. + assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + scheduler.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + scheduler.encoder_cache_manager.free(request, input_id) + + if request.num_computed_tokens == request.num_tokens: + # NOTE(woosuk): Currently, we assume that each request + # generates at most one token at each step. + token_id = sampled_token_ids[req_index] + if request_do_logprobs: + # Construct logprobs, if requested (TODO: assumes one + # generated token). + logprob_token_ids = logprob_token_ids_list[req_index] + logprob_values = logprob_values_list[req_index] + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:max_logprobs]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = ( + [float('-inf')] * + (len(logprob_values) - 1 - max_logprobs)) + + indices = sorted(range(len(logprob_values)), + key=lambda k: logprob_values[k], + reverse=True) + logprob_values = [logprob_values[i] for i in indices] + logprob_token_ids = [ + logprob_token_ids[i] for i in indices + ] + + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_values = logprob_values[0:logprob_cnt] + logprob_token_ids = logprob_token_ids[0:logprob_cnt] + + request.logprobs.append({ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(logprob_values, logprob_token_ids)) + }) + request.append_output_token_ids(token_id) + # TODO: Update the KV cache manager for prefix caching. + + # Check for stop and update request state. 
+ # This must be called before me make the EngineCoreOutput. + stopped = scheduler._check_stop(request) + + # Add EngineCoreOutput for this Request. + # Return the logprob for the most recently computed tokens. + # Return no prompt logprobs in decode-phase. + output = EngineCoreOutput( + request_id=req_id, + new_token_ids=request.output_token_ids[-num_new_tokens:], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=(request.logprobs[-num_new_tokens:] + if request_do_logprobs else None), + prompt_logprobs=(prompt_logprobs + if request_do_prompt_logprobs else None), + prompt_logprobs_token_ids=(request.prompt_token_ids + if request_do_prompt_logprobs + else None)) + engine_core_outputs.append(output) + + # Breakout of the loop. + if stopped: + continue + + elif request_do_prompt_logprobs: + # This request is still partial but prompt logprobs were + # requested + engine_core_outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=[] if request_do_logprobs else None, + prompt_logprobs=( + prompt_logprobs if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)), + prompt_logprobs_token_ids=( + request.prompt_token_ids[prompt_slice_range_lower: + prompt_slice_range_upper] + if request_do_prompt_logprobs else + ([] if request_do_prompt_logprobs else None)))) + + new_running.append(request) + scheduler.running = new_running + return engine_core_outputs + def step(self) -> List[EngineCoreOutput]: """Schedule, execute, and make output.""" @@ -111,8 +361,7 @@ def step(self) -> List[EngineCoreOutput]: scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, output) + engine_core_outputs = self.update_from_output(scheduler_output, output) return engine_core_outputs def profile(self, is_start=True): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8fe9d3adb8792..37b16051da9fb 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -76,7 +76,7 @@ def process_inputs( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: - """Process the input prompt into an engine request + """Process the input prompt into engine (& possibly tokenizer) requests Args: request_id: request ID From 05f39a9a2fa84e414a4264a3a2f0539e6a098ac1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 2 Dec 2024 18:26:10 +0800 Subject: [PATCH 118/293] [CI/Build] Update `mistral_common` version for tests and docs (#10825) Signed-off-by: DarkLight1337 Signed-off-by: Andrew Feldman --- docs/requirements-docs.txt | 2 +- requirements-test.in | 2 +- requirements-test.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index e3e35844405ac..8ea240f59c38f 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -12,7 +12,7 @@ pydantic >= 2.8 torch py-cpuinfo transformers -mistral_common >= 1.3.4 +mistral_common >= 1.5.0 aiohttp starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args diff --git a/requirements-test.in b/requirements-test.in index 76f6de2f77c34..44972866ddc4b 100644 --- 
a/requirements-test.in +++ b/requirements-test.in @@ -20,7 +20,7 @@ timm # required for internvl test torch==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[opencv] >= 1.4.4 # required for pixtral test +mistral_common[opencv] >= 1.5.0 # required for pixtral test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test diff --git a/requirements-test.txt b/requirements-test.txt index 65695111e4dc5..a59b85023948b 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -217,7 +217,7 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy -mistral-common[opencv]==1.4.4 +mistral-common[opencv]==1.5.1 # via # -r requirements-test.in # mistral-common From 74274c26f99fb67cabc2c92e78bd83b6ef0fb20e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 11:03:16 +0000 Subject: [PATCH 119/293] added explanatory comment to EngineCore.update_from_output() Signed-off-by: Andrew Feldman --- vllm/v1/engine/core.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c6ff0bc59da5f..2611d08efe0dc 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -156,6 +156,15 @@ def update_from_output( scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", ) -> List[EngineCoreOutput]: + """Build engine core output from model runner output. + + Args: + scheduler_output: scheduler output prior to engine step. + model_runner_output: model runner output from engine step. + + Returns: + Engine core output which tracks the progress of generation. + """ scheduler = self.scheduler # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() From c9a7b3f759a134e4ba5dc890d84784b7c9e9cf8b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 2 Dec 2024 02:50:10 -0800 Subject: [PATCH 120/293] [misc] use out argument for flash attention (#10822) Signed-off-by: youkaichao Signed-off-by: Andrew Feldman --- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 2 + vllm/attention/backends/flash_attn.py | 55 +++---- vllm/attention/backends/flashinfer.py | 4 + vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/backends/ipex_attn.py | 1 + vllm/attention/backends/pallas.py | 1 + vllm/attention/backends/rocm_flash_attn.py | 1 + vllm/attention/backends/torch_sdpa.py | 1 + vllm/attention/backends/xformers.py | 1 + vllm/attention/layer.py | 76 +++++++++- vllm/config.py | 2 +- vllm/v1/attention/backends/flash_attn.py | 155 +++++--------------- 13 files changed, 144 insertions(+), 157 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 5be2d83346d00..aed04361e5fb4 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -247,5 +247,6 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 9e54c3b40c54e..99cb84346d84e 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -360,6 +360,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -448,5 +449,6 @@ def forward( blocksparse_head_sliding_step=self.head_sliding_step, ) + assert output is not None # Reshape the output tensor. return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 32738d1043b1d..c69e12ad78c44 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -638,24 +638,27 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + output: shape = [num_tokens, num_heads, head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] + NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + assert output is not None, "Output tensor must be provided." 
+ if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " @@ -666,23 +669,12 @@ def forward( "requires setting cross-attention " "metadata attributes.") - num_heads: int = self.num_heads - head_size: int = self.head_size - num_kv_heads: int = self.num_kv_heads kv_cache_dtype: str = self.kv_cache_dtype softmax_scale: float = self.scale window_size = self.sliding_window alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes logits_soft_cap: Optional[float] = self.logits_soft_cap - num_tokens, hidden_size = query.shape - - # Reshape the query, key, and value tensors. - query = query.view(-1, num_heads, head_size) - if (key is not None) and (value is not None): - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - if kv_cache.numel() > 0: key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -721,13 +713,13 @@ def forward( num_decode_query_tokens) = \ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) decode_query = query[num_prefill_query_tokens:] + decode_output = output[num_prefill_query_tokens:] # QKV for prefill. query = query[:num_prefill_query_tokens] + prefill_output = output[:num_prefill_query_tokens] assert query.shape[0] == num_prefill_query_tokens assert decode_query.shape[0] == num_decode_query_tokens - prefill_output: Optional[torch.Tensor] = None - decode_output: Optional[torch.Tensor] = None if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if (kv_cache.numel() == 0 or prefill_meta.block_tables is None @@ -741,7 +733,7 @@ def forward( key = key[:num_prefill_kv_tokens] value = value[:num_prefill_kv_tokens] - prefill_output = flash_attn_varlen_func( + flash_attn_varlen_func( q=query, k=key, v=value, @@ -754,6 +746,7 @@ def forward( window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, + out=prefill_output, ) else: # prefix-enabled attention @@ -761,7 +754,7 @@ def forward( "Only decoder-only models support prefix caching") assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - prefill_output = flash_attn_varlen_func( # noqa + flash_attn_varlen_func( # noqa q=query, k=key_cache, v=value_cache, @@ -775,6 +768,7 @@ def forward( alibi_slopes=alibi_slopes, block_table=prefill_meta.block_tables, softcap=logits_soft_cap, + out=prefill_output, ) if decode_meta := attn_metadata.decode_metadata: @@ -788,7 +782,7 @@ def forward( assert attn_type == AttentionType.DECODER, ( "Only decoder-only models support max_decode_query_len > 1" ) - decode_output = flash_attn_varlen_func( + flash_attn_varlen_func( q=decode_query, k=key_cache, v=value_cache, @@ -802,6 +796,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, block_table=decode_meta.block_tables, + out=decode_output, ) else: # Use flash_attn_with_kvcache for normal decoding. 
@@ -810,7 +805,7 @@ def forward( _, block_tables_arg, ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - decode_output = flash_attn_with_kvcache( + flash_attn_with_kvcache( q=decode_query.unsqueeze(1), k_cache=key_cache, v_cache=value_cache, @@ -821,20 +816,8 @@ def forward( window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, - ).squeeze(1) - - if prefill_output is None: - assert decode_output is not None - return decode_output.view(num_decode_query_tokens, hidden_size) - if decode_output is None: - assert prefill_output is not None - return prefill_output.view(num_prefill_query_tokens, hidden_size) - - assert decode_meta is not None - decode_output = decode_output.squeeze(1) - output = torch.cat([prefill_output, decode_output], dim=0) - return output.view(num_tokens, hidden_size) - + out=decode_output.unsqueeze(1), + ) return output diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 1a2024705eb04..e367468d05d26 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -774,7 +774,11 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: + + # TODO: directly write to output tensor + if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 5359941d41fde..2c62e565c04c7 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -145,6 +145,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 3b0d51ea4a3d8..21949874bea47 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -173,6 +173,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 5988be0e6b687..9809aed0e66f9 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -151,6 +151,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 6a494f4e73cb4..9139c3c1314d8 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -415,6 +415,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. 
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index dafa5bb56acda..86e952a903f36 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -431,6 +431,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 292575a8736bc..e2e989efb020c 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -417,6 +417,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 17157617248f7..e024eef286f05 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -4,7 +4,6 @@ import torch import torch.nn as nn -import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config @@ -12,7 +11,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.platforms import current_platform +from vllm.platforms import _Backend, current_platform from vllm.utils import direct_register_custom_op @@ -97,14 +96,23 @@ def __init__( self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap) + self.num_heads = num_heads + self.head_size = head_size + self.num_kv_heads = num_kv_heads self.backend = backend_name_to_enum(attn_backend.get_name()) # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. - self.use_direct_call = envs.VLLM_USE_V1 or not ( - current_platform.is_cuda_alike() or current_platform.is_cpu()) + self.use_direct_call = not current_platform.is_cuda_alike( + ) and not current_platform.is_cpu() + + # For some attention backends, we allocate an output tensor before + # calling the custom op. When piecewise cudagraph is enabled, this + # makes sure the output tensor is allocated inside the cudagraph. + self.use_output = self.backend == _Backend.FLASH_ATTN or \ + self.backend == _Backend.FLASH_ATTN_VLLM_V1 compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -130,6 +138,22 @@ def forward( self._k_scale, self._v_scale, attn_type=attn_type) + elif self.use_output: + output = torch.empty_like(query) + hidden_size = query.size(-1) + # Reshape the query, key, and value tensors. + # NOTE(woosuk): We do this outside the custom op to minimize the + # CPU overheads from the non-CUDA-graph regions. 
+ query = query.view(-1, self.num_heads, self.head_size) + output = output.view(-1, self.num_heads, self.head_size) + if key is not None: + key = key.view(-1, self.num_kv_heads, self.head_size) + if value is not None: + value = value.view(-1, self.num_kv_heads, self.head_size) + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, kv_cache, attn_type, + self.layer_name) + return output.view(-1, hidden_size) else: return torch.ops.vllm.unified_attention(query, key, value, kv_cache, attn_type, @@ -183,3 +207,47 @@ def unified_attention_fake( fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, ) + + +def unified_attention_with_output( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + output: torch.Tensor, + kv_cache: torch.Tensor, + attn_type: str, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.dynamic_forward_context + self = forward_context.static_forward_context[layer_name] + self.impl.forward(query, + key, + value, + kv_cache, + attn_metadata, + self._k_scale, + self._v_scale, + attn_type=attn_type, + output=output) + + +def unified_attention_with_output_fake( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + output: torch.Tensor, + kv_cache: torch.Tensor, + attn_type: str, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="unified_attention_with_output", + op_func=unified_attention_with_output, + mutates_args=["kv_cache", "output"], + fake_impl=unified_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/config.py b/vllm/config.py index 510bd81d66217..5f50d65ec87e1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2238,7 +2238,7 @@ class CompilationConfig(BaseModel): custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ "vllm.unified_attention", - "vllm.unified_v1_flash_attention", + "vllm.unified_attention_with_output", ]) use_inductor: bool = True diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 4aa4b296f0efc..d37989055c2e5 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -6,8 +6,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.forward_context import get_forward_context -from vllm.utils import direct_register_custom_op from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -113,13 +111,14 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: @@ -135,118 +134,42 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") - # Reshape the query, key, and value tensors. 
- # NOTE(woosuk): We do this outside the custom op to minimize the CPU - # overheads from the non-CUDA-graph regions. - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - output = torch.empty_like(query) - torch.ops.vllm.unified_v1_flash_attention( - output, - query, - key, - value, - self.num_heads, - self.head_size, - self.num_kv_heads, - kv_cache, + if attn_metadata is None: + # Profiling run. + return output + + num_actual_tokens = attn_metadata.num_actual_tokens + + # Reshape the input keys and values and store them in the cache. + key_cache = kv_cache[0] + value_cache = kv_cache[1] + torch.ops._C_cache_ops.reshape_and_cache_flash( + key[:num_actual_tokens], + value[:num_actual_tokens], + key_cache, + value_cache, + attn_metadata.slot_mapping, self.kv_cache_dtype, k_scale, v_scale, - self.scale, - self.sliding_window, - self.alibi_slopes, - self.logits_soft_cap, ) - return output.view(-1, self.num_heads * self.head_size) - - -def unified_v1_flash_attention( - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> None: - context = get_forward_context() - current_metadata = context.dynamic_forward_context - if current_metadata is None: - # Profiling run. - return - - assert current_metadata is not None - assert isinstance(current_metadata, FlashAttentionMetadata) - attn_metadata: FlashAttentionMetadata = current_metadata - num_actual_tokens = attn_metadata.num_actual_tokens - - # Reshape the input keys and values and store them in the cache. - key_cache = kv_cache[0] - value_cache = kv_cache[1] - torch.ops._C_cache_ops.reshape_and_cache_flash( - key[:num_actual_tokens], - value[:num_actual_tokens], - key_cache, - value_cache, - attn_metadata.slot_mapping, - kv_cache_dtype, - k_scale, - v_scale, - ) - - # Compute attention and update output up to `num_actual_tokens`. - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - cu_seqlens_k=attn_metadata.seq_start_loc, - max_seqlen_k=attn_metadata.max_seq_len, - softmax_scale=softmax_scale, - causal=True, - alibi_slopes=alibi_slopes, - window_size=window_size, - block_table=attn_metadata.block_table, - softcap=logits_soft_cap, - ) - - -def unified_v1_flash_attention_fake( - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> None: - return - - -direct_register_custom_op( - op_name="unified_v1_flash_attention", - op_func=unified_v1_flash_attention, - mutates_args=["kv_cache", "output"], - fake_impl=unified_v1_flash_attention_fake, -) + + # Compute attention and update output up to `num_actual_tokens`. 
+ flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + cu_seqlens_k=attn_metadata.seq_start_loc, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=attn_metadata.block_table, + softcap=self.logits_soft_cap, + ) + + return output From f22facd7523fa6c0831ba53053c255b0aaefd7d1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 13:56:43 +0000 Subject: [PATCH 121/293] constructing dummy logprobs --- tests/v1/engine/test_detokenizer.py | 46 ++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 07f343666cb5e..ba4cd62185a45 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,12 +1,44 @@ -from typing import List +import random +from typing import Dict, List, Union import pytest from transformers import AutoTokenizer from vllm.sampling_params import RequestOutputKind +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.v1.engine import EngineCoreOutput from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest +random.seed(42) +NUM_SAMPLE_LOGPROBS = 5 +NUM_PROMPT_LOGPROBS = 7 + + +def _generate_dummy_single_logprob( + num_logprobs: int, + is_sample_logprobs: bool, +) -> Dict[int, Logprob]: + adjusted_num_logprobs = (num_logprobs + random.choice([0, 1]) + if is_sample_logprobs else num_logprobs) + return { + random.randint(0, + len(tokenizer.vocab) - 1): + Logprob(random.uniform(-100, 0), idx, None) + for idx in range(adjusted_num_logprobs) + } + + +def _generate_dummy_logprobs( + tokens_list: List, + num_logprobs: int, + is_sample_logprobs: bool, +) -> Union[SampleLogprobs, PromptLogprobs]: + return [ + _generate_dummy_single_logprob(num_logprobs, is_sample_logprobs) + for _ in tokens_list + ] + + TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) @@ -23,9 +55,21 @@ PROMPT_TOKENS = [ tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS ] +PROMPT_LOGPROBS_RAW = [ + _generate_dummy_logprobs(tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS, + is_sample_logprobs=False) + for tokens_list in PROMPT_TOKENS +] GENERATION_TOKENS = [ tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS ] +GENERATION_LOGPROBS_RAW = [ + _generate_dummy_logprobs(tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS, + is_sample_logprobs=True) + for tokens_list in GENERATION_TOKENS +] PROMPT_STRINGS = [ tokenizer.decode(prompt_tokens, skip_special_tokens=True) for prompt_tokens in PROMPT_TOKENS From b16dd7932ab900c056098316ac385f0023ae2daf Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 14:22:03 +0000 Subject: [PATCH 122/293] dummy logprobs with decodes Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 49 ++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index ba4cd62185a45..3ec8098ec86f9 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -3,6 +3,7 @@ import pytest from transformers import AutoTokenizer +from transformers.tokenization_utils import 
PreTrainedTokenizer from vllm.sampling_params import RequestOutputKind from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs @@ -13,10 +14,23 @@ NUM_SAMPLE_LOGPROBS = 5 NUM_PROMPT_LOGPROBS = 7 +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + + +def _duplicate_logprob_with_decode( + logprob: Logprob, + token_id: int, + tokenizer: PreTrainedTokenizer, +) -> Logprob: + return Logprob(logprob.logprob, logprob.rank, + tokenizer.decode(token_id, skip_special_tokens=True)) + def _generate_dummy_single_logprob( num_logprobs: int, is_sample_logprobs: bool, + tokenizer: PreTrainedTokenizer, ) -> Dict[int, Logprob]: adjusted_num_logprobs = (num_logprobs + random.choice([0, 1]) if is_sample_logprobs else num_logprobs) @@ -32,15 +46,23 @@ def _generate_dummy_logprobs( tokens_list: List, num_logprobs: int, is_sample_logprobs: bool, + tokenizer: PreTrainedTokenizer, ) -> Union[SampleLogprobs, PromptLogprobs]: return [ - _generate_dummy_single_logprob(num_logprobs, is_sample_logprobs) - for _ in tokens_list + _generate_dummy_single_logprob(num_logprobs, is_sample_logprobs, + tokenizer) for _ in tokens_list ] -TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) +def _new_logprobs_detokenized( + logprobs: Union[SampleLogprobs, PromptLogprobs], + tokenizer: PreTrainedTokenizer, +) -> Union[SampleLogprobs, PromptLogprobs]: + return [{ + tok_id: _duplicate_logprob_with_decode(lp, tok_id, tokenizer) + for tok_id, lp in lp_dict.items() + } for lp_dict in logprobs] + FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", @@ -58,21 +80,32 @@ def _generate_dummy_logprobs( PROMPT_LOGPROBS_RAW = [ _generate_dummy_logprobs(tokens_list=tokens_list, num_logprobs=NUM_PROMPT_LOGPROBS, - is_sample_logprobs=False) + is_sample_logprobs=False, + tokenizer=tokenizer) for tokens_list in PROMPT_TOKENS ] +PROMPT_LOGPROBS = [ + _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) + for logprobs in PROMPT_LOGPROBS_RAW +] GENERATION_TOKENS = [ tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS ] GENERATION_LOGPROBS_RAW = [ _generate_dummy_logprobs(tokens_list=tokens_list, num_logprobs=NUM_SAMPLE_LOGPROBS, - is_sample_logprobs=True) + is_sample_logprobs=True, + tokenizer=tokenizer) for tokens_list in GENERATION_TOKENS ] +GENERATION_LOGPROBS = [ + _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) + for logprobs in GENERATION_LOGPROBS_RAW +] PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, skip_special_tokens=True) - for prompt_tokens in PROMPT_TOKENS + tokenizer.decode(prompt_tokens, + skip_special_tokens=True, + tokenizer=tokenizer) for prompt_tokens in PROMPT_TOKENS ] PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] GENERATION_STRINGS = [ From 0054ece8e540fdc47d6a6c024bac97ed67c92adb Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 14:44:44 +0000 Subject: [PATCH 123/293] passing some detokenizer tests Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 61 +++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 3ec8098ec86f9..0f83d66566d6b 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,5 +1,5 @@ import random -from typing import Dict, List, Union +from typing import Dict, List, 
Optional, Union import pytest from transformers import AutoTokenizer @@ -117,21 +117,43 @@ def _new_logprobs_detokenized( class MockEngineCore: """Mock outputs form premade tokens lists.""" - def __init__(self, tokens_list: List[List[int]]): - self.tokens_list = tokens_list + def __init__( + self, + generated_tokens_list: List[List[int]], + prompt_tokens_list: List[List[int]], + generated_logprobs_raw: Optional[SampleLogprobs], + prompt_logprobs_raw: Optional[PromptLogprobs], + ) -> None: + self.generated_tokens_list = generated_tokens_list + self.prompt_tokens_list = prompt_tokens_list self.current_idx = 0 + self.generated_logprobs_raw = generated_logprobs_raw + self.do_logprobs = generated_logprobs_raw is not None + self.prompt_logprobs_raw = prompt_logprobs_raw + self.do_prompt_logprobs = prompt_logprobs_raw is not None def get_outputs(self) -> List[EngineCoreOutput]: + do_logprobs = self.do_logprobs + do_prompt_logprobs = self.do_prompt_logprobs token_idx = self.current_idx self.current_idx += 1 outputs = [] - for req_idx, token_ids in enumerate(self.tokens_list): - if len(token_ids) > token_idx: - output = EngineCoreOutput(request_id=f"request-{req_idx}", - new_token_ids=[token_ids[token_idx]], - finished=False) - if token_idx == len(token_ids) - 1: + for req_idx, (generated_token_ids, prompt_token_ids) in enumerate( + zip(self.generated_tokens_list, self.prompt_tokens_list)): + if len(generated_token_ids) > token_idx: + output = EngineCoreOutput( + request_id=f"request-{req_idx}", + new_token_ids=[generated_token_ids[token_idx]], + finished=False, + logprobs=self.generated_logprobs_raw + if do_logprobs else None, + prompt_logprobs=self.prompt_logprobs_raw + if do_prompt_logprobs else None, + prompt_logprobs_token_ids=prompt_token_ids + if do_prompt_logprobs else None, + ) + if token_idx == len(generated_token_ids) - 1: output.finished = True output.finish_reason = "stopped" outputs.append(output) @@ -204,9 +226,24 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -def test_stop_string(include_stop_str_in_output: bool): +@pytest.mark.parametrize("logprobs,prompt_logprobs", + [(None, None), (NUM_SAMPLE_LOGPROBS, None), + (None, NUM_PROMPT_LOGPROBS), + (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) +def test_stop_string( + include_stop_str_in_output: bool, + logprobs: Optional[int], + prompt_logprobs: Optional[int], +) -> None: + do_generated_logprobs = logprobs is not None + do_prompt_logprobs = prompt_logprobs is not None detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) + engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, + prompt_tokens_list=PROMPT_TOKENS, + generated_logprobs_raw=GENERATION_LOGPROBS_RAW + if do_generated_logprobs else None, + prompt_logprobs_raw=PROMPT_LOGPROBS_RAW + if do_prompt_logprobs else None) # Make N requests. 
requests = [ @@ -219,6 +256,8 @@ def test_stop_string(include_stop_str_in_output: bool): output_kind=RequestOutputKind.DELTA, stop=STOP_STRINGS, include_stop_str_in_output=include_stop_str_in_output, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, ) for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) From 59853d5db1554ce2325dc1c2016bc48f9927e406 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 14:59:23 +0000 Subject: [PATCH 124/293] fixing error during debug Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 0f83d66566d6b..3d50ce6389d47 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -146,11 +146,11 @@ def get_outputs(self) -> List[EngineCoreOutput]: request_id=f"request-{req_idx}", new_token_ids=[generated_token_ids[token_idx]], finished=False, - logprobs=self.generated_logprobs_raw + logprobs=self.generated_logprobs_raw[req_idx][token_idx] if do_logprobs else None, - prompt_logprobs=self.prompt_logprobs_raw + prompt_logprobs=self.prompt_logprobs_raw[req_idx] if do_prompt_logprobs else None, - prompt_logprobs_token_ids=prompt_token_ids + prompt_logprobs_token_ids=prompt_token_ids[req_idx] if do_prompt_logprobs else None, ) if token_idx == len(generated_token_ids) - 1: From 193e60c9d76a93c3e790c1b4292c171027d5c76a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 2 Dec 2024 15:12:12 +0000 Subject: [PATCH 125/293] existing detokenizer test checks are unbroken; need to add logprobs checks Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 3d50ce6389d47..972f12b2b5bd0 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -146,7 +146,7 @@ def get_outputs(self) -> List[EngineCoreOutput]: request_id=f"request-{req_idx}", new_token_ids=[generated_token_ids[token_idx]], finished=False, - logprobs=self.generated_logprobs_raw[req_idx][token_idx] + logprobs=[self.generated_logprobs_raw[req_idx][token_idx]] if do_logprobs else None, prompt_logprobs=self.prompt_logprobs_raw[req_idx] if do_prompt_logprobs else None, From 603f2b5731cf1466a4fba57aae482bb895d044dd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 4 Dec 2024 12:41:34 +0000 Subject: [PATCH 126/293] model runner returns logprobs as np arrays Signed-off-by: Andrew Feldman --- vllm/v1/outputs.py | 9 +++++---- vllm/v1/worker/gpu_model_runner.py | 11 ++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 0bbbf24abd76d..12a71f419c05c 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional +import numpy.typing as npt import torch @@ -33,11 +34,11 @@ class ModelRunnerOutput: sampled_token_ids_cpu: torch.Tensor # [num_reqs, max_num_logprobs + 1] - logprob_token_ids_cpu: Optional[torch.Tensor] + logprob_token_ids_cpu: Optional[npt.NDArray] # [num_reqs, max_num_logprobs + 1] - logprobs_cpu: Optional[torch.Tensor] + logprobs_cpu: Optional[npt.NDArray] # [num_reqs, max_num_prompt_logprobs] - prompt_logprob_token_ids_cpu: Optional[torch.Tensor] + prompt_logprob_token_ids_cpu: Optional[npt.NDArray] # [num_reqs, 
max_num_prompt_logprobs] - prompt_logprobs_cpu: Optional[torch.Tensor] + prompt_logprobs_cpu: Optional[npt.NDArray] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a985025bca6c0..7bc2ce2fd77e4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -507,14 +507,15 @@ def execute_model( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, - logprob_token_ids_cpu=(sampler_output.logprob_token_ids.cpu() - if do_logprobs else None), - logprobs_cpu=(sampler_output.logprobs.cpu() + logprob_token_ids_cpu=( + sampler_output.logprob_token_ids.cpu().numpy() + if do_logprobs else None), + logprobs_cpu=(sampler_output.logprobs.cpu().numpy() if do_logprobs else None), prompt_logprob_token_ids_cpu=( - sampler_output.prompt_logprob_token_ids.cpu() + sampler_output.prompt_logprob_token_ids.cpu().numpy() if do_prompt_logprobs else None), - prompt_logprobs_cpu=(sampler_output.prompt_logprobs.cpu() + prompt_logprobs_cpu=(sampler_output.prompt_logprobs.cpu().numpy() if do_prompt_logprobs else None)) return model_runner_output From ac602d86580072a075189ea48c4521216329bfc7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 4 Dec 2024 13:41:51 +0000 Subject: [PATCH 127/293] new request types --- vllm/v1/engine/core.py | 55 ------------------------------------------ vllm/v1/request.py | 30 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 60 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8022a92560fce..869eca0f185df 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -98,52 +98,6 @@ def abort_requests(self, request_ids: List[str]): self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) - def _pythonize_logprobs( - self, - do_logprobs: bool, - do_prompt_logprobs: bool, - model_runner_output: "ModelRunnerOutput", - ) -> Tuple[List, List, List, List]: - """Convert logprobs tensors to Python data structures. 
- - Args: - do_logprobs: sample logprobs are required - do_prompt_logprobs: prompt logprobs are required - model_runner_output: model runner output contains CPU logprobs tensors - - Returns: - logprob_token_ids_list - logprob_values_list - prompt_logprob_token_ids_list - prompt_logprob_values_list - """ - if do_logprobs: - # Pythonize sample logprobs if needed - assert model_runner_output.logprob_token_ids_cpu is not None - logprob_token_ids_list = ( - model_runner_output.logprob_token_ids_cpu.tolist()) - logprob_values_list = (model_runner_output.logprobs_cpu.tolist()) - else: - ( - logprob_token_ids_list, - logprob_values_list, - ) = (None, None) - if do_prompt_logprobs: - # Pythonize prompt logprobs if needed - assert model_runner_output.prompt_logprob_token_ids_cpu is not None - prompt_logprob_token_ids_list = ( - model_runner_output.prompt_logprob_token_ids_cpu.tolist()) - prompt_logprob_values_list = ( - model_runner_output.prompt_logprobs_cpu.tolist()) - else: - ( - prompt_logprob_token_ids_list, - prompt_logprob_values_list, - ) = (None, None) - - return (logprob_token_ids_list, logprob_values_list, - prompt_logprob_token_ids_list, prompt_logprob_values_list) - def update_from_output( self, scheduler_output: "SchedulerOutput", @@ -167,15 +121,6 @@ def update_from_output( model_runner_output.prompt_logprobs_cpu is not None and len(model_runner_output.prompt_logprobs_cpu) > 0) - # Get logprobs as Python data structures - ( - logprob_token_ids_list, - logprob_values_list, - prompt_logprob_token_ids_list, - prompt_logprob_values_list, - ) = self._pythonize_logprobs(do_logprobs, do_prompt_logprobs, - model_runner_output) - if do_prompt_logprobs: # Index into prompt tokens, for building # prompt logprobs output data structure diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 9f1b07f5bf2f7..682054f5f9260 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,13 +1,15 @@ import enum -from typing import List, Optional, Union +from typing import List, Optional, Union, Tuple from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams -from vllm.sequence import PromptLogprobs, RequestMetrics, SampleLogprobs +from vllm.sequence import RequestMetrics from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList +import numpy as np +import numpy.typing as npt class Request: @@ -45,10 +47,28 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.max_logprobs = sampling_params.logprobs self.max_prompt_logprobs = sampling_params.prompt_logprobs - self.logprobs: Optional[SampleLogprobs] = ( + # If sample logprobs are enabled, the number of sample logprobs cannot + # be anticipated in advance (because the LLM is partially responsible + # for deciding when the completion is finished.) So, + # build a list of (logprobs,logprob_token_ids) tuples for each generated + # sequence position; logprobs and logprob_token_ids are both + # 1 x num_logprobs_at_offset np arrays, + # where num_logprobs_at_offset is the number of logprobs at a + # particular offset in the generated sequence. This has overheads + # compared to a single big NDArray, but should be okay because + # subsequent logprobs pythonization steps only + # aggregate along rows, not along columns. 
+ # TODO: an alternative could be to preallocate a + # self.max_tokens x self.max_logprobs NDArray, but + # this was not employed because the array could be very large for large + # context windows, even if the completion was very short. + self.logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]] = ( None if self.max_logprobs is None else []) - self.prompt_logprobs: Optional[PromptLogprobs] = ( - None if self.max_prompt_logprobs is None else []) + # The number of prompt logprobs is known is advance, so preallocate an + # NDArray + self.prompt_logprobs: Optional[np.NDArray] = ( + None if self.max_prompt_logprobs is None else np.empty( + (self.num_prompt_tokens, self.max_prompt_logprobs))) self.num_computed_tokens = 0 mm_positions = self.inputs.multi_modal_placeholders From 2a9ef8c9c77e5c504f39123fe2639c3e61212fe8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 4 Dec 2024 14:25:09 +0000 Subject: [PATCH 128/293] first pass at only using numpy in engine core Signed-off-by: Andrew Feldman --- vllm/v1/engine/__init__.py | 11 +-- vllm/v1/engine/core.py | 105 +++++++++-------------------- vllm/v1/request.py | 11 ++- vllm/v1/worker/gpu_model_runner.py | 4 +- 4 files changed, 47 insertions(+), 84 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 38bc484daf553..c10f32dc1c061 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,13 +1,14 @@ import enum from dataclasses import dataclass -from typing import List, Optional, Union +from typing import List, Optional, Tuple, Union import msgspec +import numpy as np +import numpy.typing as npt from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import PromptLogprobs, SampleLogprobs @dataclass @@ -57,9 +58,9 @@ class EngineCoreOutput(msgspec.Struct, request_id: str new_token_ids: List[int] finished: bool - logprobs: Optional[SampleLogprobs] - prompt_logprobs: Optional[PromptLogprobs] - prompt_logprobs_token_ids: Optional[List[int]] + logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]] + prompt_logprobs: Optional[np.NDArray] + prompt_logprobs_token_ids: Optional[np.NDArray] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 869eca0f185df..28c18f9c637e6 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -14,7 +14,6 @@ from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger -from vllm.sequence import Logprob from vllm.usage.usage_lib import UsageContext from vllm.v1.core.scheduler import Scheduler, SchedulerOutput from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, @@ -152,7 +151,6 @@ def update_from_output( and num_new_prompt_tokens > 0) if request_do_prompt_logprobs: - # Construct prompt logprobs, under the condition that # prompt logprobs were requested & a nonzero number of # prompt tokens were computed in this step for this request. @@ -160,34 +158,27 @@ def update_from_output( # Note that this scenario returns an EngineCoreOutput which # is empty except for the prompt logprobs which were # computed for these prompt tokens. 
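# A minimal standalone sketch (assumed sizes and values, NumPy only) of the
# chunked fill pattern described above: a request's preallocated
# prompt-logprob array is written slice by slice from the flat per-step
# model-runner output as successive prompt chunks are computed.
import numpy as np

num_prompt_tokens, max_prompt_logprobs = 6, 2

# Per-request storage, allocated once up front.
prompt_logprobs = np.full((num_prompt_tokens, max_prompt_logprobs), np.nan)

# Flat per-step output covering several requests' prompt chunks; this
# request's new chunk occupies rows [lower:upper).
step_prompt_logprobs = np.arange(18, dtype=float).reshape(6, 3)
lower, upper = 3, 6            # slice of the step output for this request
already_computed = 2           # prompt tokens processed in earlier steps

prompt_logprobs[already_computed:already_computed + (upper - lower)] = (
    step_prompt_logprobs[lower:upper, 0:max_prompt_logprobs])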
- - slice_upper_index = (curr_prompt_base_idx + - num_new_prompt_tokens) - prompt_logprob_token_ids = prompt_logprob_token_ids_list[ - curr_prompt_base_idx:slice_upper_index] - prompt_logprob_values = prompt_logprob_values_list[ - curr_prompt_base_idx:slice_upper_index] - curr_prompt_base_idx = slice_upper_index - - logprob_cnt = max_prompt_logprobs - prompt_logprobs = [{ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(plp_tok_values[0:logprob_cnt], - plp_tok_token_ids[0:logprob_cnt])) - } for plp_tok_values, plp_tok_token_ids in zip( - prompt_logprob_values, prompt_logprob_token_ids)] - - if not request.prompt_logprobs: - # Ensure that None is the first prompt logprob - prompt_logprobs = [None] + prompt_logprobs - - curr_prompt_base_idx = slice_upper_index - - prompt_slice_range_upper = request.num_computed_tokens - prompt_slice_range_lower = (prompt_slice_range_upper - - num_new_prompt_tokens) - request.prompt_logprobs.extend(prompt_logprobs) + # + # Note: new_prompt_logprobs will be used later to build the + # engine core output + + mr_output_slice_upper_index = (curr_prompt_base_idx + + num_new_prompt_tokens) + new_prompt_logprobs = ( + model_runner_output.prompt_logprobs_cpu[ + curr_prompt_base_idx:mr_output_slice_upper_index]) + new_prompt_logprob_token_ids = ( + model_runner_output.prompt_logprob_token_ids_cpu[ + curr_prompt_base_idx:mr_output_slice_upper_index]) + + req_slice_upper_index = (request.num_computed_tokens + + num_new_prompt_tokens) + request.prompt_logprobs[ + request.num_computed_tokens: + req_slice_upper_index] = new_prompt_logprobs + request.prompt_logprob_token_ids[ + request.num_computed_tokens: + req_slice_upper_index] = new_prompt_logprob_token_ids else: curr_prompt_base_idx += num_new_prompt_tokens else: @@ -213,40 +204,11 @@ def update_from_output( # generates at most one token at each step. token_id = sampled_token_ids[req_index] if request_do_logprobs: - # Construct logprobs, if requested (TODO: assumes one - # generated token). - logprob_token_ids = logprob_token_ids_list[req_index] - logprob_values = logprob_values_list[req_index] - logprob_cnt = max_logprobs - if token_id not in logprob_token_ids[0:max_logprobs]: - # Sampled token is not in the in the top logprobs; - # inject it & resort, ensuring that excess logprobs - # not requested by the user have -inf probability - logprob_values[max_logprobs:-1] = ( - [float('-inf')] * - (len(logprob_values) - 1 - max_logprobs)) - - indices = sorted(range(len(logprob_values)), - key=lambda k: logprob_values[k], - reverse=True) - logprob_values = [logprob_values[i] for i in indices] - logprob_token_ids = [ - logprob_token_ids[i] for i in indices - ] - - # There will be one more logprob than the user requested - logprob_cnt = max_logprobs + 1 - - # Only keep the number of logprobs specified by the request - # (plus possibly the sampled token id & its logprob) - logprob_values = logprob_values[0:logprob_cnt] - logprob_token_ids = logprob_token_ids[0:logprob_cnt] - - request.logprobs.append({ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(logprob_values, logprob_token_ids)) - }) + # Slice out this request's sample logprobs; defer + # pythonization to be carried out in the frontend. + request.logprobs.append( + (model_runner_output.logprobs_cpu[req_index], + model_runner_output.logprob_token_ids_cpu[req_index])) request.append_output_token_ids(token_id) # TODO: Update the KV cache manager for prefix caching. 
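# A minimal standalone sketch (assumed values) of the deferred "pythonization"
# step sketched above: the engine core keeps only raw (values, token_ids)
# NumPy rows per generated position, and the frontend later converts each row
# pair into an OpenAI-style {token_id: logprob-entry} mapping with 1-based
# ranks. `SimpleLogprob` is a simplified stand-in, not the vLLM Logprob class.
from dataclasses import dataclass
from typing import Dict, Optional

import numpy as np


@dataclass
class SimpleLogprob:
    logprob: float
    rank: int
    decoded_token: Optional[str] = None


def pythonize_position(values: np.ndarray,
                       token_ids: np.ndarray) -> Dict[int, SimpleLogprob]:
    # Rows arrive sorted from most to least likely, so the rank is just the
    # 1-based index within the row.
    return {
        int(tok): SimpleLogprob(float(val), rank + 1)
        for rank, (val, tok) in enumerate(
            zip(values.tolist(), token_ids.tolist()))
    }


# One decode step's top-3 logprobs for a single request.
print(pythonize_position(np.array([-0.11, -2.73, -4.60]),
                         np.array([311, 262, 1002])))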
@@ -265,9 +227,9 @@ def update_from_output( stop_reason=request.stop_reason, logprobs=(request.logprobs[-num_new_tokens:] if request_do_logprobs else None), - prompt_logprobs=(prompt_logprobs + prompt_logprobs=(new_prompt_logprobs if request_do_prompt_logprobs else None), - prompt_logprobs_token_ids=(request.prompt_token_ids + prompt_logprobs_token_ids=(new_prompt_logprob_token_ids if request_do_prompt_logprobs else None)) engine_core_outputs.append(output) @@ -287,14 +249,9 @@ def update_from_output( finish_reason=request.get_finished_reason(), stop_reason=request.stop_reason, logprobs=[] if request_do_logprobs else None, - prompt_logprobs=( - prompt_logprobs if request_do_prompt_logprobs else - ([] if request_do_prompt_logprobs else None)), - prompt_logprobs_token_ids=( - request.prompt_token_ids[prompt_slice_range_lower: - prompt_slice_range_upper] - if request_do_prompt_logprobs else - ([] if request_do_prompt_logprobs else None)))) + prompt_logprobs=new_prompt_logprobs, + prompt_logprobs_token_ids=new_prompt_logprob_token_ids) + ) new_running.append(request) scheduler.running = new_running diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 682054f5f9260..777e40539dd9e 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,5 +1,8 @@ import enum -from typing import List, Optional, Union, Tuple +from typing import List, Optional, Tuple, Union + +import numpy as np +import numpy.typing as npt from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest @@ -8,8 +11,6 @@ from vllm.sequence import RequestMetrics from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList -import numpy as np -import numpy.typing as npt class Request: @@ -69,6 +70,10 @@ def __init__( self.prompt_logprobs: Optional[np.NDArray] = ( None if self.max_prompt_logprobs is None else np.empty( (self.num_prompt_tokens, self.max_prompt_logprobs))) + self.prompt_logprob_token_ids: Optional[np.NDArray] = ( + None if self.max_prompt_logprobs is None else np.empty( + (self.num_prompt_tokens, self.max_prompt_logprobs), + dtype=np.int32)) self.num_computed_tokens = 0 mm_positions = self.inputs.multi_modal_placeholders diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7bc2ce2fd77e4..2d19a55382b16 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -20,7 +20,7 @@ is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) -from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.outputs import ModelRunnerOutput, SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: @@ -474,7 +474,7 @@ def execute_model( hidden_states = hidden_states[:num_scheduled_tokens] # Sample the next token and get logprobs if needed. 
- sampler_output = self.model.sample( + sampler_output: SamplerOutput = self.model.sample( logits=self.model.compute_logits(hidden_states, None), sampling_metadata=sampling_metadata, ) From 2fe9147eaaf6391794f804a4d187cc4a48d7820d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 4 Dec 2024 15:00:49 +0000 Subject: [PATCH 129/293] tested removal of pythonization from engine core Signed-off-by: Andrew Feldman --- vllm/v1/engine/__init__.py | 5 ++--- vllm/v1/engine/core.py | 17 +++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index c10f32dc1c061..bf12851ec8c42 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -3,7 +3,6 @@ from typing import List, Optional, Tuple, Union import msgspec -import numpy as np import numpy.typing as npt from vllm.lora.request import LoRARequest @@ -59,8 +58,8 @@ class EngineCoreOutput(msgspec.Struct, new_token_ids: List[int] finished: bool logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]] - prompt_logprobs: Optional[np.NDArray] - prompt_logprobs_token_ids: Optional[np.NDArray] + prompt_logprobs: Optional[npt.NDArray] + prompt_logprobs_token_ids: Optional[npt.NDArray] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 28c18f9c637e6..97d545cecb1c8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,6 +128,7 @@ def update_from_output( engine_core_outputs: List[EngineCoreOutput] = [] for request in scheduler.running: req_id = request.request_id + prev_num_computed_tokens = request.num_computed_tokens request.num_computed_tokens += num_scheduled_tokens[req_id] req_index = model_runner_output.req_id_to_index[req_id] num_new_tokens = 1 @@ -155,29 +156,33 @@ def update_from_output( # prompt logprobs were requested & a nonzero number of # prompt tokens were computed in this step for this request. # + # Pythonization is deferred to outside the engine core. + # # Note that this scenario returns an EngineCoreOutput which # is empty except for the prompt logprobs which were # computed for these prompt tokens. 
# # Note: new_prompt_logprobs will be used later to build the # engine core output - + logprob_cnt = max_prompt_logprobs mr_output_slice_upper_index = (curr_prompt_base_idx + num_new_prompt_tokens) new_prompt_logprobs = ( model_runner_output.prompt_logprobs_cpu[ - curr_prompt_base_idx:mr_output_slice_upper_index]) + curr_prompt_base_idx:mr_output_slice_upper_index, + 0:logprob_cnt]) new_prompt_logprob_token_ids = ( model_runner_output.prompt_logprob_token_ids_cpu[ - curr_prompt_base_idx:mr_output_slice_upper_index]) + curr_prompt_base_idx:mr_output_slice_upper_index, + 0:logprob_cnt]) - req_slice_upper_index = (request.num_computed_tokens + + req_slice_upper_index = (prev_num_computed_tokens + num_new_prompt_tokens) request.prompt_logprobs[ - request.num_computed_tokens: + prev_num_computed_tokens: req_slice_upper_index] = new_prompt_logprobs request.prompt_logprob_token_ids[ - request.num_computed_tokens: + prev_num_computed_tokens: req_slice_upper_index] = new_prompt_logprob_token_ids else: curr_prompt_base_idx += num_new_prompt_tokens From a46a8e599e3dd9a4ea305eb7c88baa5d20eaeaef Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 4 Dec 2024 21:10:31 +0000 Subject: [PATCH 130/293] wip detokenizer updates Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 72 +++++++++++++++++++++-------------- vllm/v1/request.py | 4 +- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5ad8b8c725f3e..8848a8374ead5 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -11,6 +11,7 @@ detokenize_logprob_incrementally_in_place) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput +import numpy.typing as npt logger = init_logger(__name__) @@ -105,26 +106,62 @@ def from_new_request( logprobs=[] if do_logprobs else None, prompt_logprobs=[] if do_prompt_logprobs else None) + def _pythonize_maybe_detokenize_sample_logprobs( + new_logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], + detokenize: bool, + ) -> SampleLogprobs: + pass + + def _pythonize_maybe_detokenize_prompt_logprobs( + new_prompt_logprobs: Optional[npt.NDArray], + new_prompt_logprob_token_ids: Optional[npt.NDArray], + detokenize: bool, + ) -> PromptLogprobs: + pass + def add_tokens( self, new_token_ids: List[int], - new_logprobs: Optional[SampleLogprobs], - new_prompt_logprobs: Optional[PromptLogprobs], + new_logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], + new_prompt_logprobs: Optional[npt.NDArray], + new_prompt_logprob_token_ids: Optional[npt.NDArray], finish_reason: Optional[str], stop_reason: Optional[str], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: - 1) Detokenize the new token ids incrementally. - 1a) If necessary, detokenize logprobs incrementally - 1b) If necessary, detokenize prompt logprobs incrementally - 2) Update the RequestOutput with the new text. + 1) If necessary, detokenize logprobs *non*-incrementally + 2) If necessary, detokenize prompt logprobs *non*-incrementally + 3) Detokenize the new token ids incrementally. + 4) Update the RequestOutput with the new text. """ do_logprobs = new_logprobs is not None and len(new_logprobs) > 0 assert not do_logprobs or len(new_logprobs) == len(new_token_ids) - # 1) Detokenize the new token ids incrementally. 
If necessary, + # 1) If required, Pythonize & detokenize sample logprobs + if do_logprobs: + # Detokenize individual token logprobs in-place + logprob_dict = new_logprobs[tdx] + assert logprob_dict is not None + detokenize_logprob_incrementally_in_place( + tokenizer=self.tokenizer, + logprob_dict=logprob_dict, + input_ids_prefix=self.token_ids[0:-1], + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + self.logprobs.append(logprob_dict) + + # 2) If necessary, detokenize prompt logprobs incrementally + if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: + self.prompt_logprobs.extend(new_prompt_logprobs) + + # 3) Detokenize the new token ids incrementally. If necessary, # detokenize logprobs. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. @@ -143,23 +180,6 @@ def add_tokens( spaces_between_special_tokens, ) - if do_logprobs: - # Detokenize individual token logprobs in-place - logprob_dict = new_logprobs[tdx] - assert logprob_dict is not None - detokenize_logprob_incrementally_in_place( - tokenizer=self.tokenizer, - logprob_dict=logprob_dict, - input_ids_prefix=self.token_ids[0:-1], - prev_tokens=self.tokens, - prefix_offset=self.prefix_offset, - read_offset=self.read_offset, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self. - spaces_between_special_tokens, - ) - self.logprobs.append(logprob_dict) - self.tokens.extend(new_tokens) self.prefix_offset = prefix_offset self.read_offset = read_offset @@ -167,10 +187,6 @@ def add_tokens( decoded_text += new_decoded_token_text - # 1b) If necessary, detokenize prompt logprobs incrementally - if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: - self.prompt_logprobs.extend(new_prompt_logprobs) - # 2) Evaluate stop criteria. 
if self.stop: stop = StopChecker.check_stop_strings( diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 777e40539dd9e..9f14e7c9e16e9 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -67,10 +67,10 @@ def __init__( None if self.max_logprobs is None else []) # The number of prompt logprobs is known is advance, so preallocate an # NDArray - self.prompt_logprobs: Optional[np.NDArray] = ( + self.prompt_logprobs: Optional[npt.NDArray] = ( None if self.max_prompt_logprobs is None else np.empty( (self.num_prompt_tokens, self.max_prompt_logprobs))) - self.prompt_logprob_token_ids: Optional[np.NDArray] = ( + self.prompt_logprob_token_ids: Optional[npt.NDArray] = ( None if self.max_prompt_logprobs is None else np.empty( (self.num_prompt_tokens, self.max_prompt_logprobs), dtype=np.int32)) From 0c04576aada39a0526a66af70a9f9191d8957d33 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 05:23:14 +0000 Subject: [PATCH 131/293] wip --- vllm/v1/engine/detokenizer.py | 69 +++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8848a8374ead5..415fae6a3ea25 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -25,8 +25,8 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] - logprobs: Optional[SampleLogprobs] - prompt_logprobs: Optional[PromptLogprobs] + request_logprobs: Optional[SampleLogprobs] + request_prompt_logprobs: Optional[PromptLogprobs] # Stop strings stop: List[str] @@ -53,6 +53,10 @@ class IncrementalDetokenizer: stop_buffer_length: int _last_output_text_offset: int = 0 + # Maximum number of sample logprobs for this request + request_max_sample_logprobs: Optional[int] + request_max_prompt_logprobs: Optional[int] + @property def output_token_ids(self) -> List[int]: assert len(self.token_ids) >= len(self.prompt_token_ids) @@ -81,6 +85,7 @@ def from_new_request( # Logprobs & prompt logprobs settings do_logprobs = request.logprobs is not None and request.logprobs > 0 + do_prompt_logprobs = (request.prompt_logprobs is not None and request.prompt_logprobs > 0) @@ -106,13 +111,49 @@ def from_new_request( logprobs=[] if do_logprobs else None, prompt_logprobs=[] if do_prompt_logprobs else None) - def _pythonize_maybe_detokenize_sample_logprobs( - new_logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], + def _pythonize_maybe_detokenize_sample_logprobs_for_request( + new_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], detokenize: bool, ) -> SampleLogprobs: - pass - - def _pythonize_maybe_detokenize_prompt_logprobs( + for logprob_values, logprob_token_ids in new_logprobs: + + + # Construct logprobs, if requested (TODO: assumes one + # generated token). 
+ logprob_token_ids = logprob_token_ids_list[req_index] + logprob_values = logprob_values_list[req_index] + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:max_logprobs]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = ( + [float('-inf')] * + (len(logprob_values) - 1 - max_logprobs)) + + indices = sorted(range(len(logprob_values)), + key=lambda k: logprob_values[k], + reverse=True) + logprob_values = [logprob_values[i] for i in indices] + logprob_token_ids = [ + logprob_token_ids[i] for i in indices + ] + + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_values = logprob_values[0:logprob_cnt] + logprob_token_ids = logprob_token_ids[0:logprob_cnt] + + request.logprobs.append({ + lpt: Logprob(lpv, (idx + 1), None) + for idx, (lpv, lpt) in enumerate( + zip(logprob_values, logprob_token_ids)) + }) + + def _pythonize_maybe_detokenize_prompt_logprobs_for_request( new_prompt_logprobs: Optional[npt.NDArray], new_prompt_logprob_token_ids: Optional[npt.NDArray], detokenize: bool, @@ -141,6 +182,12 @@ def add_tokens( # 1) If required, Pythonize & detokenize sample logprobs if do_logprobs: + + self.request_logprobs.append(self._pythonize_maybe_detokenize_sample_logprobs_for_request( + new_logprobs, + detokenize=True + )) + # Detokenize individual token logprobs in-place logprob_dict = new_logprobs[tdx] assert logprob_dict is not None @@ -155,11 +202,11 @@ def add_tokens( spaces_between_special_tokens=self. spaces_between_special_tokens, ) - self.logprobs.append(logprob_dict) + self.request_logprobs.append(logprob_dict) # 2) If necessary, detokenize prompt logprobs incrementally if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: - self.prompt_logprobs.extend(new_prompt_logprobs) + self.request_prompt_logprobs.extend(new_prompt_logprobs) # 3) Detokenize the new token ids incrementally. If necessary, # detokenize logprobs. @@ -212,8 +259,8 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - logprobs = new_logprobs if delta else self.logprobs - prompt_logprobs = new_prompt_logprobs if delta else self.prompt_logprobs + logprobs = new_logprobs if delta else self.request_logprobs + prompt_logprobs = new_prompt_logprobs if delta else self.request_prompt_logprobs request_output = RequestOutput.new( self.request_id, From 0f04d6ec75a5346f4566a4fb3ed90304c65d0628 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 05:27:47 +0000 Subject: [PATCH 132/293] wip Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 415fae6a3ea25..3d48aa1afbe39 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -89,6 +89,9 @@ def from_new_request( do_prompt_logprobs = (request.prompt_logprobs is not None and request.prompt_logprobs > 0) + if do_logprobs: + self. 
+ return cls( output_text="", tokens=tokens, From c6831ca6634d40bd232f282a99ac97fe13c4c652 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 06:57:00 +0000 Subject: [PATCH 133/293] first pass at pythonization moved out of engine Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 226 ++++++++++++++++++++++------------ 1 file changed, 144 insertions(+), 82 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 3d48aa1afbe39..514faf31a74a1 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,17 @@ from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple, Union +import numpy.typing as npt + from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind -from vllm.sequence import PromptLogprobs, SampleLogprobs +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, - detokenize_logprob_incrementally_in_place) + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput -import numpy.typing as npt logger = init_logger(__name__) @@ -49,14 +49,16 @@ class IncrementalDetokenizer: # Tokenizer for this request tokenizer: AnyTokenizer + # Maximum number of sample logprobs for this request + max_request_sample_logprobs: Optional[int] + + # Maximum number of prompt logprobs for this request + max_request_prompt_logprobs: Optional[int] + # Accounting for stop string buffering stop_buffer_length: int _last_output_text_offset: int = 0 - # Maximum number of sample logprobs for this request - request_max_sample_logprobs: Optional[int] - request_max_prompt_logprobs: Optional[int] - @property def output_token_ids(self) -> List[int]: assert len(self.token_ids) >= len(self.prompt_token_ids) @@ -84,13 +86,11 @@ def from_new_request( stop_buffer_length = 0 # Logprobs & prompt logprobs settings - do_logprobs = request.logprobs is not None and request.logprobs > 0 - - do_prompt_logprobs = (request.prompt_logprobs is not None - and request.prompt_logprobs > 0) + do_request_logprobs = (request.logprobs is not None + and request.logprobs > 0) - if do_logprobs: - self. 
+ do_request_prompt_logprobs = (request.prompt_logprobs is not None + and request.prompt_logprobs > 0) return cls( output_text="", @@ -111,57 +111,126 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - logprobs=[] if do_logprobs else None, - prompt_logprobs=[] if do_prompt_logprobs else None) + max_request_sample_logprobs=request.logprobs, + max_request_prompt_logprobs=request.prompt_logprobs, + request_logprobs=[] if do_request_logprobs else None, + request_prompt_logprobs=[] if do_request_prompt_logprobs else None) + + def _detokenize_ids( + self, + token_id_list: int, + ) -> List[str]: + return self.tokenizer.convert_ids_to_tokens( + token_id_list, skip_special_tokens=self.skip_special_tokens) + + def _pythonize_sequence_position( + self, + logprob_values: npt.NDArray, + logprob_token_ids: npt.NDArray, + detokenize: bool, + ) -> Dict[int, Logprob]: + """Pythonize the numpy (np) logprobs & token ids for a sequence position + + Optionally detokenize (compute logprob decoded token str) + + Args: + logprob_values: np logprob values + logprob_token_ids: np logprob token ids + detokenize: if True, compute logprob decoded token str, + (o/w decoded_token=None) + + Return: + mapping from top token id to Logprob data structure + """ + logprob_values = logprob_values.tolist() + logprob_token_ids = logprob_token_ids.tolist() + logprob_token_strs = (self._detokenize_ids(logprob_token_ids) if + detokenize else [None] * len(logprob_token_ids)) + + return { + lpt: Logprob(lpv, (idx + 1), lpstr) + for idx, (lpv, lpt, lpstr) in enumerate( + zip(logprob_values, logprob_token_ids, logprob_token_strs)) + } def _pythonize_maybe_detokenize_sample_logprobs_for_request( + self, new_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], detokenize: bool, ) -> SampleLogprobs: - for logprob_values, logprob_token_ids in new_logprobs: + """Pythonize sample logprobs, maybe detokenize. + + Pythonization entails the conversion from a numpy (np) + values/token ids representation to the more idiomatically + Pythonic representation required by the OpenAI API, + List[Dict[int,Logprob]] + The Logprob.decoded_token field is only computed (detokenized + from the associated top token id) if detokenize=True - # Construct logprobs, if requested (TODO: assumes one - # generated token). 
- logprob_token_ids = logprob_token_ids_list[req_index] - logprob_values = logprob_values_list[req_index] - logprob_cnt = max_logprobs - if token_id not in logprob_token_ids[0:max_logprobs]: - # Sampled token is not in the in the top logprobs; - # inject it & resort, ensuring that excess logprobs - # not requested by the user have -inf probability - logprob_values[max_logprobs:-1] = ( - [float('-inf')] * - (len(logprob_values) - 1 - max_logprobs)) - - indices = sorted(range(len(logprob_values)), - key=lambda k: logprob_values[k], - reverse=True) - logprob_values = [logprob_values[i] for i in indices] - logprob_token_ids = [ - logprob_token_ids[i] for i in indices - ] - - # There will be one more logprob than the user requested - logprob_cnt = max_logprobs + 1 - - # Only keep the number of logprobs specified by the request - # (plus possibly the sampled token id & its logprob) - logprob_values = logprob_values[0:logprob_cnt] - logprob_token_ids = logprob_token_ids[0:logprob_cnt] - - request.logprobs.append({ - lpt: Logprob(lpv, (idx + 1), None) - for idx, (lpv, lpt) in enumerate( - zip(logprob_values, logprob_token_ids)) - }) + Args: + new_logprobs: List of (logprobs,logprob token ids) numpy array tuples + detokenize: Logprob.decoded_token is computed if True, otherwise None + + Returns: + Sample logprobs, Pythonized and possibly detokenized + """ + for logprob_values, logprob_token_ids in new_logprobs: + # Only keep the number of logprobs specified by the request + # (plus possibly the sampled token id & its logprob) + logprob_cnt = self.max_request_sample_logprobs + self.request_logprobs.append( + self._pythonize_sequence_position( + logprob_values[0:logprob_cnt], + logprob_token_ids[0:logprob_cnt], detokenize)) + + # if token_id not in logprob_token_ids[0:max_logprobs]: + # # Sampled token is not in the in the top logprobs; + # # inject it & resort, ensuring that excess logprobs + # # not requested by the user have -inf probability + # logprob_values[max_logprobs:-1] = ( + # [float('-inf')] * + # (len(logprob_values) - 1 - max_logprobs)) + + # indices = sorted(range(len(logprob_values)), + # key=lambda k: logprob_values[k], + # reverse=True) + # logprob_values = [logprob_values[i] for i in indices] + # logprob_token_ids = [ + # logprob_token_ids[i] for i in indices + # ] + + # # There will be one more logprob than the user requested + # logprob_cnt = max_logprobs + 1 def _pythonize_maybe_detokenize_prompt_logprobs_for_request( - new_prompt_logprobs: Optional[npt.NDArray], - new_prompt_logprob_token_ids: Optional[npt.NDArray], + self, + prompt_logprob_values: Optional[npt.NDArray], + prompt_logprob_token_ids: Optional[npt.NDArray], detokenize: bool, ) -> PromptLogprobs: - pass + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. + # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. 
+ logprob_cnt = self.max_request_prompt_logprobs + prompt_logprobs = [ + self._pythonize_sequence_position(plp_tok_values, + plp_tok_token_ids, detokenize) + for plp_tok_values, plp_tok_token_ids in zip( + # Slice out top prompt logprobs + prompt_logprob_values[:, 0:logprob_cnt], + prompt_logprob_token_ids[:, 0:logprob_cnt]) + ] + + if not self.request_prompt_logprobs: + # Ensure that None is the first prompt logprob + prompt_logprobs = [None] + prompt_logprobs + + self.request_prompt_logprobs.extend(prompt_logprobs) def add_tokens( self, @@ -180,43 +249,33 @@ def add_tokens( 4) Update the RequestOutput with the new text. """ - do_logprobs = new_logprobs is not None and len(new_logprobs) > 0 - assert not do_logprobs or len(new_logprobs) == len(new_token_ids) + do_request_sample_logprobs = new_logprobs is not None and len( + new_logprobs) > 0 + assert not do_request_sample_logprobs or len(new_logprobs) == len( + new_token_ids) + do_request_prompt_logprobs = new_prompt_logprobs is not None and len( + new_prompt_logprobs) > 0 + assert (not do_request_prompt_logprobs + or new_prompt_logprob_token_ids is not None) # 1) If required, Pythonize & detokenize sample logprobs - if do_logprobs: - - self.request_logprobs.append(self._pythonize_maybe_detokenize_sample_logprobs_for_request( - new_logprobs, - detokenize=True - )) - - # Detokenize individual token logprobs in-place - logprob_dict = new_logprobs[tdx] - assert logprob_dict is not None - detokenize_logprob_incrementally_in_place( - tokenizer=self.tokenizer, - logprob_dict=logprob_dict, - input_ids_prefix=self.token_ids[0:-1], - prev_tokens=self.tokens, - prefix_offset=self.prefix_offset, - read_offset=self.read_offset, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self. - spaces_between_special_tokens, - ) - self.request_logprobs.append(logprob_dict) + if do_request_sample_logprobs: + self._pythonize_maybe_detokenize_sample_logprobs_for_request( + new_logprobs, detokenize=True) # 2) If necessary, detokenize prompt logprobs incrementally - if new_prompt_logprobs is not None and len(new_prompt_logprobs) > 0: - self.request_prompt_logprobs.extend(new_prompt_logprobs) + if do_request_prompt_logprobs: + self._pythonize_maybe_detokenize_prompt_logprobs_for_request( + new_prompt_logprobs, + new_prompt_logprob_token_ids, + detokenize=True) # 3) Detokenize the new token ids incrementally. If necessary, # detokenize logprobs. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. decoded_text = "" - for tdx, new_token_id in enumerate(new_token_ids): + for new_token_id in new_token_ids: self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -263,7 +322,8 @@ def add_tokens( output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids logprobs = new_logprobs if delta else self.request_logprobs - prompt_logprobs = new_prompt_logprobs if delta else self.request_prompt_logprobs + prompt_logprobs = (new_prompt_logprobs + if delta else self.request_prompt_logprobs) request_output = RequestOutput.new( self.request_id, @@ -366,6 +426,8 @@ def step( new_token_ids=engine_core_output.new_token_ids, new_logprobs=engine_core_output.logprobs, new_prompt_logprobs=engine_core_output.prompt_logprobs, + new_prompt_logprob_token_ids=engine_core_output. 
+ prompt_logprobs_token_ids, finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, ) From ae7e10c9c5ff8b257478959833940befe04dfbe9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 07:18:39 +0000 Subject: [PATCH 134/293] incremental/non-incremental detokenized text comparison --- tests/v1/samplers/test_logprobs.py | 34 +++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index a42e78da85ca0..0d7da5ed71819 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -1,3 +1,4 @@ +import re from typing import List, Tuple import pytest @@ -75,6 +76,36 @@ def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: raise ValueError("Invalid logprobs batch configuration for test.") +def _assert_incr_detok_str_matches_non_incr_detok_str( + incremental_detokenization_str: str, + non_incremental_detokenization_str: str, + msg: str, +) -> None: + """Compare incrementally detok. text to non-incrementally detok. text + + Fail if the strings mismatch after non-alphanumeric characters are stripped + out. + + Rationale: incremental detokenization in the text generation process allows + the tokenizer to adjust the next token text output based on the token's + context in the string. However, logprobs detokenization detokenizes each + token individually, and the resultant strings may include some + non-alphanumeric placeholder characters where there could be i.e. + whitespace. So, this function compares only the alphanumeric text + between two strings and fails if there is a mismatch, which helps + with validating logprobs detokenization. + + Args: + incremental_detokenization_str: incrementally-detokenized generated text + non_incremental_detokenization_str: non-incrementally-detokenized logprob + tokens + msg: error message if `assert` fails + """ + rgx = r'[^a-zA-Z0-9]+' + assert (re.sub(rgx, '', incremental_detokenization_str) == re.sub( + rgx, '', non_incremental_detokenization_str)), (msg) + + def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, @@ -180,7 +211,8 @@ def _test_case_get_logprobs_and_prompt_logprobs( if detokenize: output_string_from_most_likely_tokens = "".join( output_string_from_most_likely_tokens_lst) - assert output_text == output_string_from_most_likely_tokens, ( + _assert_incr_detok_str_matches_non_incr_detok_str( + output_text, output_string_from_most_likely_tokens, "The output text from the top logprob for each token " "position should be the same as the output text in the " "result.") From 3cffca39252fe008f6a7c913b9159da90b11cf1a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 07:36:25 +0000 Subject: [PATCH 135/293] implemented the sample logprobs N+1 scenario in the front end Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 514faf31a74a1..4fa2f986d68ac 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple, Union +import numpy as np import numpy.typing as npt from vllm.engine.output_processor.stop_checker import StopChecker @@ -156,6 +157,7 @@ def _pythonize_sequence_position( def 
_pythonize_maybe_detokenize_sample_logprobs_for_request( self, new_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], + new_token_ids: List[int], detokenize: bool, ) -> SampleLogprobs: """Pythonize sample logprobs, maybe detokenize. @@ -175,10 +177,26 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( Returns: Sample logprobs, Pythonized and possibly detokenized """ - for logprob_values, logprob_token_ids in new_logprobs: + max_logprobs = self.max_request_sample_logprobs + for (logprob_values, + logprob_token_ids), token_id in zip(new_logprobs, new_token_ids): # Only keep the number of logprobs specified by the request # (plus possibly the sampled token id & its logprob) - logprob_cnt = self.max_request_sample_logprobs + logprob_cnt = max_logprobs + if token_id not in logprob_token_ids[0:logprob_cnt]: + # Sampled token is not in the in the top logprobs; + # inject it & resort, ensuring that excess logprobs + # not requested by the user have -inf probability + logprob_values[max_logprobs:-1] = float('-inf') + # Get indices that would sort logprob_values in descending order + indices = np.argsort(logprob_values)[::-1] + # Use these indices to reorder logprob_values and + # logprob_token_ids + logprob_values = logprob_values[indices] + logprob_token_ids = logprob_token_ids[indices] + # There will be one more logprob than the user requested + logprob_cnt = max_logprobs + 1 + self.request_logprobs.append( self._pythonize_sequence_position( logprob_values[0:logprob_cnt], @@ -261,7 +279,7 @@ def add_tokens( # 1) If required, Pythonize & detokenize sample logprobs if do_request_sample_logprobs: self._pythonize_maybe_detokenize_sample_logprobs_for_request( - new_logprobs, detokenize=True) + new_logprobs, new_token_ids, detokenize=True) # 2) If necessary, detokenize prompt logprobs incrementally if do_request_prompt_logprobs: From 73e4c12891df8d7d373abbc2652dbc9eeeb1fef1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 07:52:34 +0000 Subject: [PATCH 136/293] fixed prompt logprob count bug Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 4fa2f986d68ac..e7cf01d03fb5c 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -120,9 +120,10 @@ def from_new_request( def _detokenize_ids( self, token_id_list: int, + skip_special_tokens=False, ) -> List[str]: return self.tokenizer.convert_ids_to_tokens( - token_id_list, skip_special_tokens=self.skip_special_tokens) + token_id_list, skip_special_tokens=skip_special_tokens) def _pythonize_sequence_position( self, @@ -202,25 +203,6 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( logprob_values[0:logprob_cnt], logprob_token_ids[0:logprob_cnt], detokenize)) - # if token_id not in logprob_token_ids[0:max_logprobs]: - # # Sampled token is not in the in the top logprobs; - # # inject it & resort, ensuring that excess logprobs - # # not requested by the user have -inf probability - # logprob_values[max_logprobs:-1] = ( - # [float('-inf')] * - # (len(logprob_values) - 1 - max_logprobs)) - - # indices = sorted(range(len(logprob_values)), - # key=lambda k: logprob_values[k], - # reverse=True) - # logprob_values = [logprob_values[i] for i in indices] - # logprob_token_ids = [ - # logprob_token_ids[i] for i in indices - # ] - - # # There will be one more logprob than the user requested - # logprob_cnt = max_logprobs + 1 - def 
_pythonize_maybe_detokenize_prompt_logprobs_for_request( self, prompt_logprob_values: Optional[npt.NDArray], From 5b49d36705dad9ef05cd4ba80d6d3a4f833d4d29 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 08:58:28 +0000 Subject: [PATCH 137/293] passing one test! Signed-off-by: Andrew Feldman --- vllm/v1/engine/core.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 97d545cecb1c8..acef4e49310fe 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -123,7 +123,7 @@ def update_from_output( if do_prompt_logprobs: # Index into prompt tokens, for building # prompt logprobs output data structure - curr_prompt_base_idx = 0 + mr_output_slice_lower_index = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] for request in scheduler.running: @@ -143,9 +143,11 @@ def update_from_output( # token is discarded and all sequence offsets are prompt # offsets), otherwise it is the number of scheduled # tokens minus one (for the sampled token) + req_is_not_partial = (scheduler_output.partial_req_index != + req_index) num_new_prompt_tokens = ( num_scheduled_tokens[request.request_id] - - int(scheduler_output.partial_req_index != req_index)) + int(req_is_not_partial)) request_do_prompt_logprobs = (max_prompt_logprobs is not None and max_prompt_logprobs > 0 @@ -165,16 +167,16 @@ def update_from_output( # Note: new_prompt_logprobs will be used later to build the # engine core output logprob_cnt = max_prompt_logprobs - mr_output_slice_upper_index = (curr_prompt_base_idx + - num_new_prompt_tokens) + mr_output_slice_upper_index = ( + mr_output_slice_lower_index + num_new_prompt_tokens) new_prompt_logprobs = ( model_runner_output.prompt_logprobs_cpu[ - curr_prompt_base_idx:mr_output_slice_upper_index, - 0:logprob_cnt]) + mr_output_slice_lower_index: + mr_output_slice_upper_index, 0:logprob_cnt]) new_prompt_logprob_token_ids = ( model_runner_output.prompt_logprob_token_ids_cpu[ - curr_prompt_base_idx:mr_output_slice_upper_index, - 0:logprob_cnt]) + mr_output_slice_lower_index: + mr_output_slice_upper_index, 0:logprob_cnt]) req_slice_upper_index = (prev_num_computed_tokens + num_new_prompt_tokens) @@ -184,8 +186,9 @@ def update_from_output( request.prompt_logprob_token_ids[ prev_num_computed_tokens: req_slice_upper_index] = new_prompt_logprob_token_ids + mr_output_slice_lower_index = mr_output_slice_upper_index else: - curr_prompt_base_idx += num_new_prompt_tokens + mr_output_slice_lower_index += num_new_prompt_tokens else: request_do_prompt_logprobs = False From 0cf2c794f38caa3a595d3eeda018eaa4110b0cbe Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 21:32:02 +0000 Subject: [PATCH 138/293] successfully failing cumulative logprobs test Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 0d7da5ed71819..0533cac74acee 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -5,7 +5,7 @@ import torch from tests.kernels.utils import override_backend_env_variable -from vllm import SamplingParams +from vllm import CompletionOutput, SamplingParams from ...conftest import VllmRunner @@ -106,6 +106,14 @@ def _assert_incr_detok_str_matches_non_incr_detok_str( rgx, '', non_incremental_detokenization_str)), (msg) +def 
_compute_correct_cumulative_logprob( + completion_output: CompletionOutput) -> float: + token_ids = completion_output.token_ids + logprobs = completion_output.logprobs + assert logprobs is not None + return sum([lp[tok_id].logprob for tok_id, lp in zip(token_ids, logprobs)]) + + def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, @@ -235,6 +243,16 @@ def _test_case_get_logprobs_and_prompt_logprobs( assert isinstance(sample_logprob.decoded_token, str), ( "The token should be decoded by the time it is" " returned to the user.") + + # At this point we know the sample logprobs are correct for this + # request. Validate that cumulative_logprob is actually the sum. + # For each request, assert that the returned cumulative logprob + # matches the correct value, which is computed below. + torch.testing.assert_close( + vllm_result.outputs[0].cumulative_logprob, + _compute_correct_cumulative_logprob(vllm_result.outputs[0]), + atol=1e-6, + rtol=1e-6) else: # Logprobs disabled for this request; should be None assert vllm_result.outputs[0].logprobs is None From 49e0b33432f982f05132e685fb4b97dd9415dee6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 5 Dec 2024 21:59:17 +0000 Subject: [PATCH 139/293] cumulative logprob works Signed-off-by: Andrew Feldman --- vllm/outputs.py | 12 +++++++----- vllm/v1/engine/detokenizer.py | 20 ++++++++++++++------ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 08bc5a91174a9..c412d5ce21571 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -129,6 +129,7 @@ def new( token_ids: List[int], logprobs: Optional[SampleLogprobs], prompt_logprobs: Optional[PromptLogprobs], + cumulative_logprob: Optional[float], finished: bool = False, ) -> "RequestOutput": """Initialize a new RequestOutput object. @@ -145,11 +146,12 @@ def new( """ # TODO: Support `n` > 1. - completion_output = CompletionOutput(index=0, - text=text, - token_ids=token_ids, - cumulative_logprob=None, - logprobs=logprobs) + completion_output = CompletionOutput( + index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=cumulative_logprob, + logprobs=logprobs) return RequestOutput( request_id=request_id, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e7cf01d03fb5c..53bc078897f77 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -28,6 +28,7 @@ class IncrementalDetokenizer: token_ids: List[int] request_logprobs: Optional[SampleLogprobs] request_prompt_logprobs: Optional[PromptLogprobs] + request_cumulative_logprob: Optional[float] # Stop strings stop: List[str] @@ -115,7 +116,8 @@ def from_new_request( max_request_sample_logprobs=request.logprobs, max_request_prompt_logprobs=request.prompt_logprobs, request_logprobs=[] if do_request_logprobs else None, - request_prompt_logprobs=[] if do_request_prompt_logprobs else None) + request_prompt_logprobs=[] if do_request_prompt_logprobs else None, + request_cumulative_logprob=0 if do_request_logprobs else None) def _detokenize_ids( self, @@ -160,7 +162,7 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( new_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], new_token_ids: List[int], detokenize: bool, - ) -> SampleLogprobs: + ) -> Tuple[SampleLogprobs, float]: """Pythonize sample logprobs, maybe detokenize. 
Pythonization entails the conversion from a numpy (np) @@ -171,6 +173,8 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( The Logprob.decoded_token field is only computed (detokenized from the associated top token id) if detokenize=True + Also computes cumulative logprob. + Args: new_logprobs: List of (logprobs,logprob token ids) numpy array tuples detokenize: Logprob.decoded_token is computed if True, otherwise None @@ -198,10 +202,12 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( # There will be one more logprob than the user requested logprob_cnt = max_logprobs + 1 - self.request_logprobs.append( - self._pythonize_sequence_position( - logprob_values[0:logprob_cnt], - logprob_token_ids[0:logprob_cnt], detokenize)) + new_pythonized_logprobs = self._pythonize_sequence_position( + logprob_values[0:logprob_cnt], + logprob_token_ids[0:logprob_cnt], detokenize) + self.request_logprobs.append(new_pythonized_logprobs) + self.request_cumulative_logprob += new_pythonized_logprobs[ + token_id].logprob def _pythonize_maybe_detokenize_prompt_logprobs_for_request( self, @@ -324,6 +330,7 @@ def add_tokens( logprobs = new_logprobs if delta else self.request_logprobs prompt_logprobs = (new_prompt_logprobs if delta else self.request_prompt_logprobs) + cumulative_logprob = self.request_cumulative_logprob request_output = RequestOutput.new( self.request_id, @@ -333,6 +340,7 @@ def add_tokens( token_ids, logprobs, prompt_logprobs, + cumulative_logprob, finished, ) From e8bd24732129dd1e40c665a59e38570b85b38879 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 6 Dec 2024 21:48:39 +0000 Subject: [PATCH 140/293] wip Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 517 +++++++++++++++------------- vllm/v1/engine/detokenizer.py | 3 +- 2 files changed, 276 insertions(+), 244 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 972f12b2b5bd0..54a0a0b4211b0 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,5 +1,8 @@ import random -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple + +import numpy as np +import numpy.typing as npt import pytest from transformers import AutoTokenizer @@ -17,17 +20,62 @@ TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - -def _duplicate_logprob_with_decode( +def _create_random_top_logprob_array( + shape: Tuple, + lower: float, + upper: float, +) -> npt.NDArray: + return np.random.rand(*shape) * (upper - lower) + lower + +def _create_random_top_token_array(shape: Tuple, + lower: int, + upper: int, + sampled_token_ids: Optional[npt.NDArray], + adjust_num_logprobs: bool, +) -> npt.NDArray: + choice_list = list(range(lower,upper)) + res = np.random.choice(choice_list,(shape[0], shape[1] + (1 if adjust_num_logprobs else 0)),replace=False) + if sampled_token_ids is not None: + res[:,-1] = sampled_token_ids + return res + +def _generate_dummy_sample_logprobs( + sampled_tokens_list: List, + num_logprobs: int, + tokenizer: PreTrainedTokenizer, +) -> List[Tuple[npt.NDArray,npt.NDArray]]: + res=[] + for sampled_token_id in sampled_tokens_list: + num_logprobs_adjustment = random.choice([0, 1]) + res.append( + (_create_random_top_logprob_array((1,num_logprobs+num_logprobs_adjustment), -100, 0), + _create_random_top_token_array((1,num_logprobs),0, len(tokenizer.vocab) - 1, + np.array([sampled_token_id]),num_logprobs_adjustment > 
0) + )) + return res + +def _generate_dummy_prompt_logprobs( + tokens_list: List, + num_logprobs: int, + tokenizer: PreTrainedTokenizer, +) -> Tuple[npt.NDArray, npt.NDArray]: + num_tok = len(tokens_list) + return ( + _create_random_top_logprob_array((num_tok,num_logprobs), -100, 0), + _create_random_top_token_array((num_tok,num_logprobs),0, len(tokenizer.vocab) - 1, + None,0) + ) + +def _copy_logprob_add_decode( logprob: Logprob, token_id: int, tokenizer: PreTrainedTokenizer, ) -> Logprob: return Logprob(logprob.logprob, logprob.rank, - tokenizer.decode(token_id, skip_special_tokens=True)) - + tokenizer.convert_ids_to_tokens( + [token_id], skip_special_tokens=False)) -def _generate_dummy_single_logprob( +def _generate_dummy_logprobs_tuple( num_logprobs: int, is_sample_logprobs: bool, tokenizer: PreTrainedTokenizer, @@ -41,25 +89,12 @@ def _generate_dummy_single_logprob( for idx in range(adjusted_num_logprobs) } - -def _generate_dummy_logprobs( - tokens_list: List, - num_logprobs: int, - is_sample_logprobs: bool, - tokenizer: PreTrainedTokenizer, -) -> Union[SampleLogprobs, PromptLogprobs]: - return [ - _generate_dummy_single_logprob(num_logprobs, is_sample_logprobs, - tokenizer) for _ in tokens_list - ] - - def _new_logprobs_detokenized( logprobs: Union[SampleLogprobs, PromptLogprobs], tokenizer: PreTrainedTokenizer, ) -> Union[SampleLogprobs, PromptLogprobs]: return [{ - tok_id: _duplicate_logprob_with_decode(lp, tok_id, tokenizer) + tok_id: _copy_logprob_add_decode(lp, tok_id, tokenizer) for tok_id, lp in lp_dict.items() } for lp_dict in logprobs] @@ -77,10 +112,9 @@ def _new_logprobs_detokenized( PROMPT_TOKENS = [ tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS ] -PROMPT_LOGPROBS_RAW = [ - _generate_dummy_logprobs(tokens_list=tokens_list, +PROMPT_LOGPROBS_RAW:Tuple[npt.NDArray, npt.NDArray] = [ + _generate_dummy_prompt_logprobs(tokens_list=tokens_list, num_logprobs=NUM_PROMPT_LOGPROBS, - is_sample_logprobs=False, tokenizer=tokenizer) for tokens_list in PROMPT_TOKENS ] @@ -92,9 +126,8 @@ def _new_logprobs_detokenized( tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS ] GENERATION_LOGPROBS_RAW = [ - _generate_dummy_logprobs(tokens_list=tokens_list, + _generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, num_logprobs=NUM_SAMPLE_LOGPROBS, - is_sample_logprobs=True, tokenizer=tokenizer) for tokens_list in GENERATION_TOKENS ] @@ -102,220 +135,220 @@ def _new_logprobs_detokenized( _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) for logprobs in GENERATION_LOGPROBS_RAW ] -PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, - skip_special_tokens=True, - tokenizer=tokenizer) for prompt_tokens in PROMPT_TOKENS -] -PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] -GENERATION_STRINGS = [ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) -] - - -class MockEngineCore: - """Mock outputs form premade tokens lists.""" - - def __init__( - self, - generated_tokens_list: List[List[int]], - prompt_tokens_list: List[List[int]], - generated_logprobs_raw: Optional[SampleLogprobs], - prompt_logprobs_raw: Optional[PromptLogprobs], - ) -> None: - self.generated_tokens_list = generated_tokens_list - self.prompt_tokens_list = prompt_tokens_list - self.current_idx = 0 - self.generated_logprobs_raw = generated_logprobs_raw - self.do_logprobs = generated_logprobs_raw is not None - self.prompt_logprobs_raw = prompt_logprobs_raw - self.do_prompt_logprobs = prompt_logprobs_raw is not None - - def 
get_outputs(self) -> List[EngineCoreOutput]: - do_logprobs = self.do_logprobs - do_prompt_logprobs = self.do_prompt_logprobs - token_idx = self.current_idx - self.current_idx += 1 - - outputs = [] - for req_idx, (generated_token_ids, prompt_token_ids) in enumerate( - zip(self.generated_tokens_list, self.prompt_tokens_list)): - if len(generated_token_ids) > token_idx: - output = EngineCoreOutput( - request_id=f"request-{req_idx}", - new_token_ids=[generated_token_ids[token_idx]], - finished=False, - logprobs=[self.generated_logprobs_raw[req_idx][token_idx]] - if do_logprobs else None, - prompt_logprobs=self.prompt_logprobs_raw[req_idx] - if do_prompt_logprobs else None, - prompt_logprobs_token_ids=prompt_token_ids[req_idx] - if do_prompt_logprobs else None, - ) - if token_idx == len(generated_token_ids) - 1: - output.finished = True - output.finish_reason = "stopped" - outputs.append(output) - - return outputs - - -@pytest.mark.parametrize( - "request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind): - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) - - # Make N requests. - requests = [ - DetokenizerRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - ) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - gen_tokens = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - assert len(requests_to_abort) == 0 - - # Update tracking. - for request_output in request_outputs: - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - - # Confirmed tracked values matches what we expected. 
- for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(GENERATION_STRINGS, GENERATION_TOKENS)): - gen_str = gen_strings[f"request-{idx}"] - gen_toks = gen_tokens[f"request-{idx}"] - - assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" - assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() - - -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.parametrize("logprobs,prompt_logprobs", - [(None, None), (NUM_SAMPLE_LOGPROBS, None), - (None, NUM_PROMPT_LOGPROBS), - (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) -def test_stop_string( - include_stop_str_in_output: bool, - logprobs: Optional[int], - prompt_logprobs: Optional[int], -) -> None: - do_generated_logprobs = logprobs is not None - do_prompt_logprobs = prompt_logprobs is not None - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, - prompt_tokens_list=PROMPT_TOKENS, - generated_logprobs_raw=GENERATION_LOGPROBS_RAW - if do_generated_logprobs else None, - prompt_logprobs_raw=PROMPT_LOGPROBS_RAW - if do_prompt_logprobs else None) - - # Make N requests. - requests = [ - DetokenizerRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, - ) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - aborted = [] - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - for request_output in request_outputs: - # If aborted, we should not get a request output. - assert request_output.request_id not in aborted - aborted.extend(requests_to_abort) - - # Update tracking. - for request_output in request_outputs: - if request_output.finished: - assert request_output.outputs[0].finish_reason == "stop" - - request_id = request_output.request_id - new_text = request_output.outputs[0].text - if request_id not in gen_strings: - gen_strings[request_id] = new_text - else: - gen_strings[request_id] += new_text - - # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, - stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): - - # Request should be aborted. - request_id = f"request-{idx}" - assert request_id in aborted - - # Collected values that were generated. - gen_str = gen_strings[request_id] - - # Construct reference strings. 
- stop_str_idx = ref_gen_str.find(stop_str) - ref_str_exc_stop = ref_gen_str[:stop_str_idx] - ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - - if include_stop_str_in_output: - assert gen_str == ref_str_inc_stop, ( - f"{gen_str=}, {ref_str_inc_stop=}") - else: - assert gen_str == ref_str_exc_stop, ( - f"{gen_str=}, {ref_str_exc_stop=}") - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() +# PROMPT_STRINGS = [ +# tokenizer.decode(prompt_tokens, +# skip_special_tokens=True, +# tokenizer=tokenizer) for prompt_tokens in PROMPT_TOKENS +# ] +# PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +# GENERATION_STRINGS = [ +# text[prompt_len:] +# for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +# ] + + +# class MockEngineCore: +# """Mock outputs form premade tokens lists.""" + +# def __init__( +# self, +# generated_tokens_list: List[List[int]], +# prompt_tokens_list: List[List[int]], +# generated_logprobs_raw: Optional[SampleLogprobs], +# prompt_logprobs_raw: Optional[PromptLogprobs], +# ) -> None: +# self.generated_tokens_list = generated_tokens_list +# self.prompt_tokens_list = prompt_tokens_list +# self.current_idx = 0 +# self.generated_logprobs_raw = generated_logprobs_raw +# self.do_logprobs = generated_logprobs_raw is not None +# self.prompt_logprobs_raw = prompt_logprobs_raw +# self.do_prompt_logprobs = prompt_logprobs_raw is not None + +# def get_outputs(self) -> List[EngineCoreOutput]: +# do_logprobs = self.do_logprobs +# do_prompt_logprobs = self.do_prompt_logprobs +# token_idx = self.current_idx +# self.current_idx += 1 + +# outputs = [] +# for req_idx, (generated_token_ids, prompt_token_ids) in enumerate( +# zip(self.generated_tokens_list, self.prompt_tokens_list)): +# if len(generated_token_ids) > token_idx: +# output = EngineCoreOutput( +# request_id=f"request-{req_idx}", +# new_token_ids=[generated_token_ids[token_idx]], +# finished=False, +# logprobs=[self.generated_logprobs_raw[req_idx][token_idx]] +# if do_logprobs else None, +# prompt_logprobs=self.prompt_logprobs_raw[req_idx] +# if do_prompt_logprobs else None, +# prompt_logprobs_token_ids=prompt_token_ids[req_idx] +# if do_prompt_logprobs else None, +# ) +# if token_idx == len(generated_token_ids) - 1: +# output.finished = True +# output.finish_reason = "stopped" +# outputs.append(output) + +# return outputs + + +# @pytest.mark.parametrize( +# "request_output_kind", +# [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +# def test_incremental_detokenization(request_output_kind: RequestOutputKind): +# detokenizer = Detokenizer(TOKENIZER_NAME) +# engine_core = MockEngineCore(GENERATION_TOKENS) + +# # Make N requests. +# requests = [ +# DetokenizerRequest( +# request_id=f"request-{idx}", +# prompt=prompt, +# prompt_token_ids=prompt_tokens, +# skip_special_tokens=False, +# spaces_between_special_tokens=False, +# output_kind=request_output_kind, +# stop=[], +# include_stop_str_in_output=False, +# ) for idx, ( +# prompt, +# prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) +# ] + +# # Add requests to the detokenizer. +# for request in requests: +# detokenizer.add_request(request) + +# gen_strings = {} +# gen_tokens = {} +# while True: +# # Mock output from the EngineCore. +# outputs = engine_core.get_outputs() +# if len(outputs) == 0: +# break + +# # Step the Detokenizer. +# request_outputs, requests_to_abort = detokenizer.step(outputs) +# assert len(requests_to_abort) == 0 + +# # Update tracking. 
+# for request_output in request_outputs: +# request_id = request_output.request_id +# new_text = request_output.outputs[0].text +# new_tokens = request_output.outputs[0].token_ids +# if request_id not in gen_strings: +# gen_strings[request_id] = new_text +# gen_tokens[request_id] = new_tokens +# else: +# gen_strings[request_id] += new_text +# gen_tokens[request_id].extend(new_tokens) + +# # Confirmed tracked values matches what we expected. +# for idx, (ref_gen_str, ref_gen_toks) in enumerate( +# zip(GENERATION_STRINGS, GENERATION_TOKENS)): +# gen_str = gen_strings[f"request-{idx}"] +# gen_toks = gen_tokens[f"request-{idx}"] + +# assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" +# assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + +# assert detokenizer.get_num_unfinished_requests() == 0 +# assert not detokenizer.has_unfinished_requests() + + +# @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +# @pytest.mark.parametrize("logprobs,prompt_logprobs", +# [(None, None), (NUM_SAMPLE_LOGPROBS, None), +# (None, NUM_PROMPT_LOGPROBS), +# (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) +# def test_stop_string( +# include_stop_str_in_output: bool, +# logprobs: Optional[int], +# prompt_logprobs: Optional[int], +# ) -> None: +# do_generated_logprobs = logprobs is not None +# do_prompt_logprobs = prompt_logprobs is not None +# detokenizer = Detokenizer(TOKENIZER_NAME) +# engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, +# prompt_tokens_list=PROMPT_TOKENS, +# generated_logprobs_raw=GENERATION_LOGPROBS_RAW +# if do_generated_logprobs else None, +# prompt_logprobs_raw=PROMPT_LOGPROBS_RAW +# if do_prompt_logprobs else None) + +# # Make N requests. +# requests = [ +# DetokenizerRequest( +# request_id=f"request-{idx}", +# prompt=prompt, +# prompt_token_ids=prompt_tokens, +# skip_special_tokens=False, +# spaces_between_special_tokens=False, +# output_kind=RequestOutputKind.DELTA, +# stop=STOP_STRINGS, +# include_stop_str_in_output=include_stop_str_in_output, +# logprobs=logprobs, +# prompt_logprobs=prompt_logprobs, +# ) for idx, ( +# prompt, +# prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) +# ] + +# # Add requests to the detokenizer. +# for request in requests: +# detokenizer.add_request(request) + +# gen_strings = {} +# aborted = [] +# while True: +# # Mock output from the EngineCore. +# outputs = engine_core.get_outputs() +# if len(outputs) == 0: +# break + +# # Step the Detokenizer. +# request_outputs, requests_to_abort = detokenizer.step(outputs) +# for request_output in request_outputs: +# # If aborted, we should not get a request output. +# assert request_output.request_id not in aborted +# aborted.extend(requests_to_abort) + +# # Update tracking. +# for request_output in request_outputs: +# if request_output.finished: +# assert request_output.outputs[0].finish_reason == "stop" + +# request_id = request_output.request_id +# new_text = request_output.outputs[0].text +# if request_id not in gen_strings: +# gen_strings[request_id] = new_text +# else: +# gen_strings[request_id] += new_text + +# # Confirmed tracked values matches what we expected. +# for idx, (ref_gen_str, +# stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + +# # Request should be aborted. +# request_id = f"request-{idx}" +# assert request_id in aborted + +# # Collected values that were generated. +# gen_str = gen_strings[request_id] + +# # Construct reference strings. 
+# stop_str_idx = ref_gen_str.find(stop_str) +# ref_str_exc_stop = ref_gen_str[:stop_str_idx] +# ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + +# if include_stop_str_in_output: +# assert gen_str == ref_str_inc_stop, ( +# f"{gen_str=}, {ref_str_inc_stop=}") +# else: +# assert gen_str == ref_str_exc_stop, ( +# f"{gen_str=}, {ref_str_exc_stop=}") + +# assert detokenizer.get_num_unfinished_requests() == 0 +# assert not detokenizer.has_unfinished_requests() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 53bc078897f77..33d546ee060dc 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -122,10 +122,9 @@ def from_new_request( def _detokenize_ids( self, token_id_list: int, - skip_special_tokens=False, ) -> List[str]: return self.tokenizer.convert_ids_to_tokens( - token_id_list, skip_special_tokens=skip_special_tokens) + token_id_list, skip_special_tokens=False) def _pythonize_sequence_position( self, From 9f3981786edb9017add312655a735b7654a77c51 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 01:21:03 +0000 Subject: [PATCH 141/293] progress toward detok stop token test Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 594 ++++++++++++++++------------ vllm/v1/engine/detokenizer.py | 4 +- 2 files changed, 337 insertions(+), 261 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 54a0a0b4211b0..604350b693417 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,9 +1,8 @@ import random -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np import numpy.typing as npt - import pytest from transformers import AutoTokenizer from transformers.tokenization_utils import PreTrainedTokenizer @@ -20,60 +19,124 @@ TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) -def _create_random_top_logprob_array( - shape: Tuple, - lower: float, - upper: float, + +def _create_random_top_logprob_vector( + num_logprobs: int, + lower: float, + upper: float, +) -> npt.NDArray: + return np.random.rand(num_logprobs) * (upper - lower) + lower + + +def _create_random_top_logprob_matrix( + shape: Tuple, + lower: float, + upper: float, ) -> npt.NDArray: return np.random.rand(*shape) * (upper - lower) + lower -def _create_random_top_token_array(shape: Tuple, - lower: int, - upper: int, - sampled_token_ids: Optional[npt.NDArray], - adjust_num_logprobs: bool, + +def _create_random_top_token_vector( + num_logprobs: int, + lower: int, + upper: int, + sampled_token_ids: Optional[npt.NDArray], + adjust_num_logprobs: bool, +) -> npt.NDArray: + choice_list = list(range(lower, upper)) + res = np.random.choice(choice_list, (num_logprobs + + (1 if adjust_num_logprobs else 0), ), + replace=False) + if sampled_token_ids is not None: + res[-1] = sampled_token_ids + return res + + +def _create_random_top_token_matrix( + shape: Tuple, + lower: int, + upper: int, + sampled_token_ids: Optional[npt.NDArray], + adjust_num_logprobs: bool, ) -> npt.NDArray: - choice_list = list(range(lower,upper)) - res = np.random.choice(choice_list,(shape[0], shape[1] + (1 if adjust_num_logprobs else 0)),replace=False) + choice_list = list(range(lower, upper)) + res = np.random.choice(choice_list, (shape[0], shape[1] + + (1 if adjust_num_logprobs else 0)), + replace=False) if sampled_token_ids is not None: - res[:,-1] = 
sampled_token_ids + res[:, -1] = sampled_token_ids return res + def _generate_dummy_sample_logprobs( sampled_tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, -) -> List[Tuple[npt.NDArray,npt.NDArray]]: - res=[] +) -> List[Tuple[npt.NDArray, npt.NDArray]]: + res = [] for sampled_token_id in sampled_tokens_list: num_logprobs_adjustment = random.choice([0, 1]) res.append( - (_create_random_top_logprob_array((1,num_logprobs+num_logprobs_adjustment), -100, 0), - _create_random_top_token_array((1,num_logprobs),0, len(tokenizer.vocab) - 1, - np.array([sampled_token_id]),num_logprobs_adjustment > 0) - )) + (_create_random_top_logprob_vector( + num_logprobs + num_logprobs_adjustment, -100, 0), + _create_random_top_token_vector(num_logprobs, 0, + len(tokenizer.vocab) - 1, + np.array([sampled_token_id]), + num_logprobs_adjustment > 0))) return res + def _generate_dummy_prompt_logprobs( tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, ) -> Tuple[npt.NDArray, npt.NDArray]: num_tok = len(tokens_list) - return ( - _create_random_top_logprob_array((num_tok,num_logprobs), -100, 0), - _create_random_top_token_array((num_tok,num_logprobs),0, len(tokenizer.vocab) - 1, - None,0) - ) + return (_create_random_top_logprob_matrix((num_tok, num_logprobs), -100, + 0), + _create_random_top_token_matrix((num_tok, num_logprobs), 0, + len(tokenizer.vocab) - 1, None, + False)) + + +def _pythonize_logprobs_at_single_seq_offset( + logprobs_np: npt.NDArray, + token_ids_np: npt.NDArray, + tokenizer: PreTrainedTokenizer, +) -> Dict[int, Logprob]: + return { + tok_id: Logprob( + val, tdx + 1, + tokenizer.convert_ids_to_tokens([tok_id], + skip_special_tokens=False)) + for tdx, (val, tok_id) in enumerate(zip(logprobs_np, token_ids_np)) + } + + +def _detokenize_prompt_logprobs( + prompt_logprobs_np: Tuple[npt.NDArray, npt.NDArray], + tokenizer: PreTrainedTokenizer, +) -> PromptLogprobs: + prompt_logprobs_np_vals = prompt_logprobs_np[0] + prompt_logprobs_np_toks = prompt_logprobs_np[1] + num_prompt_tokens = prompt_logprobs_np_vals.shape[0] + res = [ + _pythonize_logprobs_at_single_seq_offset( + prompt_logprobs_np_vals[sdx, :], prompt_logprobs_np_toks[sdx, :], + tokenizer) for sdx in range(num_prompt_tokens) + ] + return res + def _copy_logprob_add_decode( logprob: Logprob, token_id: int, tokenizer: PreTrainedTokenizer, ) -> Logprob: - return Logprob(logprob.logprob, logprob.rank, - tokenizer.convert_ids_to_tokens( - [token_id], skip_special_tokens=False)) + return Logprob( + logprob.logprob, logprob.rank, + tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)) + def _generate_dummy_logprobs_tuple( num_logprobs: int, @@ -89,10 +152,9 @@ def _generate_dummy_logprobs_tuple( for idx in range(adjusted_num_logprobs) } -def _new_logprobs_detokenized( - logprobs: Union[SampleLogprobs, PromptLogprobs], - tokenizer: PreTrainedTokenizer, -) -> Union[SampleLogprobs, PromptLogprobs]: + +def _new_logprobs_detokenized(logprobs: Union[SampleLogprobs, PromptLogprobs], + C) -> Union[SampleLogprobs, PromptLogprobs]: return [{ tok_id: _copy_logprob_add_decode(lp, tok_id, tokenizer) for tok_id, lp in lp_dict.items() @@ -112,243 +174,257 @@ def _new_logprobs_detokenized( PROMPT_TOKENS = [ tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS ] -PROMPT_LOGPROBS_RAW:Tuple[npt.NDArray, npt.NDArray] = [ +PROMPT_LOGPROBS_RAW: List[Tuple[npt.NDArray, npt.NDArray]] = [ _generate_dummy_prompt_logprobs(tokens_list=tokens_list, - num_logprobs=NUM_PROMPT_LOGPROBS, - tokenizer=tokenizer) + 
num_logprobs=NUM_PROMPT_LOGPROBS, + tokenizer=tokenizer) for tokens_list in PROMPT_TOKENS ] -PROMPT_LOGPROBS = [ - _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) - for logprobs in PROMPT_LOGPROBS_RAW -] +# PROMPT_LOGPROBS = [ +# _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) +# for logprobs in PROMPT_LOGPROBS_RAW +# ] GENERATION_TOKENS = [ tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS ] GENERATION_LOGPROBS_RAW = [ _generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, - num_logprobs=NUM_SAMPLE_LOGPROBS, - tokenizer=tokenizer) + num_logprobs=NUM_SAMPLE_LOGPROBS, + tokenizer=tokenizer) for tokens_list in GENERATION_TOKENS ] -GENERATION_LOGPROBS = [ - _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) - for logprobs in GENERATION_LOGPROBS_RAW -] -# PROMPT_STRINGS = [ -# tokenizer.decode(prompt_tokens, -# skip_special_tokens=True, -# tokenizer=tokenizer) for prompt_tokens in PROMPT_TOKENS -# ] -# PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] -# GENERATION_STRINGS = [ -# text[prompt_len:] -# for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +# GENERATION_LOGPROBS = [ +# _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) +# for logprobs in GENERATION_LOGPROBS_RAW # ] +PROMPT_STRINGS = [ + tokenizer.decode(prompt_tokens, + skip_special_tokens=True, + tokenizer=tokenizer) for prompt_tokens in PROMPT_TOKENS +] +PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +GENERATION_STRINGS = [ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +] -# class MockEngineCore: -# """Mock outputs form premade tokens lists.""" - -# def __init__( -# self, -# generated_tokens_list: List[List[int]], -# prompt_tokens_list: List[List[int]], -# generated_logprobs_raw: Optional[SampleLogprobs], -# prompt_logprobs_raw: Optional[PromptLogprobs], -# ) -> None: -# self.generated_tokens_list = generated_tokens_list -# self.prompt_tokens_list = prompt_tokens_list -# self.current_idx = 0 -# self.generated_logprobs_raw = generated_logprobs_raw -# self.do_logprobs = generated_logprobs_raw is not None -# self.prompt_logprobs_raw = prompt_logprobs_raw -# self.do_prompt_logprobs = prompt_logprobs_raw is not None - -# def get_outputs(self) -> List[EngineCoreOutput]: -# do_logprobs = self.do_logprobs -# do_prompt_logprobs = self.do_prompt_logprobs -# token_idx = self.current_idx -# self.current_idx += 1 - -# outputs = [] -# for req_idx, (generated_token_ids, prompt_token_ids) in enumerate( -# zip(self.generated_tokens_list, self.prompt_tokens_list)): -# if len(generated_token_ids) > token_idx: -# output = EngineCoreOutput( -# request_id=f"request-{req_idx}", -# new_token_ids=[generated_token_ids[token_idx]], -# finished=False, -# logprobs=[self.generated_logprobs_raw[req_idx][token_idx]] -# if do_logprobs else None, -# prompt_logprobs=self.prompt_logprobs_raw[req_idx] -# if do_prompt_logprobs else None, -# prompt_logprobs_token_ids=prompt_token_ids[req_idx] -# if do_prompt_logprobs else None, -# ) -# if token_idx == len(generated_token_ids) - 1: -# output.finished = True -# output.finish_reason = "stopped" -# outputs.append(output) - -# return outputs - - -# @pytest.mark.parametrize( -# "request_output_kind", -# [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -# def test_incremental_detokenization(request_output_kind: RequestOutputKind): -# detokenizer = Detokenizer(TOKENIZER_NAME) -# engine_core = 
MockEngineCore(GENERATION_TOKENS) - -# # Make N requests. -# requests = [ -# DetokenizerRequest( -# request_id=f"request-{idx}", -# prompt=prompt, -# prompt_token_ids=prompt_tokens, -# skip_special_tokens=False, -# spaces_between_special_tokens=False, -# output_kind=request_output_kind, -# stop=[], -# include_stop_str_in_output=False, -# ) for idx, ( -# prompt, -# prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) -# ] - -# # Add requests to the detokenizer. -# for request in requests: -# detokenizer.add_request(request) - -# gen_strings = {} -# gen_tokens = {} -# while True: -# # Mock output from the EngineCore. -# outputs = engine_core.get_outputs() -# if len(outputs) == 0: -# break - -# # Step the Detokenizer. -# request_outputs, requests_to_abort = detokenizer.step(outputs) -# assert len(requests_to_abort) == 0 - -# # Update tracking. -# for request_output in request_outputs: -# request_id = request_output.request_id -# new_text = request_output.outputs[0].text -# new_tokens = request_output.outputs[0].token_ids -# if request_id not in gen_strings: -# gen_strings[request_id] = new_text -# gen_tokens[request_id] = new_tokens -# else: -# gen_strings[request_id] += new_text -# gen_tokens[request_id].extend(new_tokens) - -# # Confirmed tracked values matches what we expected. -# for idx, (ref_gen_str, ref_gen_toks) in enumerate( -# zip(GENERATION_STRINGS, GENERATION_TOKENS)): -# gen_str = gen_strings[f"request-{idx}"] -# gen_toks = gen_tokens[f"request-{idx}"] - -# assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" -# assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - -# assert detokenizer.get_num_unfinished_requests() == 0 -# assert not detokenizer.has_unfinished_requests() - - -# @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -# @pytest.mark.parametrize("logprobs,prompt_logprobs", -# [(None, None), (NUM_SAMPLE_LOGPROBS, None), -# (None, NUM_PROMPT_LOGPROBS), -# (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) -# def test_stop_string( -# include_stop_str_in_output: bool, -# logprobs: Optional[int], -# prompt_logprobs: Optional[int], -# ) -> None: -# do_generated_logprobs = logprobs is not None -# do_prompt_logprobs = prompt_logprobs is not None -# detokenizer = Detokenizer(TOKENIZER_NAME) -# engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, -# prompt_tokens_list=PROMPT_TOKENS, -# generated_logprobs_raw=GENERATION_LOGPROBS_RAW -# if do_generated_logprobs else None, -# prompt_logprobs_raw=PROMPT_LOGPROBS_RAW -# if do_prompt_logprobs else None) - -# # Make N requests. -# requests = [ -# DetokenizerRequest( -# request_id=f"request-{idx}", -# prompt=prompt, -# prompt_token_ids=prompt_tokens, -# skip_special_tokens=False, -# spaces_between_special_tokens=False, -# output_kind=RequestOutputKind.DELTA, -# stop=STOP_STRINGS, -# include_stop_str_in_output=include_stop_str_in_output, -# logprobs=logprobs, -# prompt_logprobs=prompt_logprobs, -# ) for idx, ( -# prompt, -# prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) -# ] - -# # Add requests to the detokenizer. -# for request in requests: -# detokenizer.add_request(request) - -# gen_strings = {} -# aborted = [] -# while True: -# # Mock output from the EngineCore. -# outputs = engine_core.get_outputs() -# if len(outputs) == 0: -# break - -# # Step the Detokenizer. -# request_outputs, requests_to_abort = detokenizer.step(outputs) -# for request_output in request_outputs: -# # If aborted, we should not get a request output. 
-# assert request_output.request_id not in aborted -# aborted.extend(requests_to_abort) - -# # Update tracking. -# for request_output in request_outputs: -# if request_output.finished: -# assert request_output.outputs[0].finish_reason == "stop" - -# request_id = request_output.request_id -# new_text = request_output.outputs[0].text -# if request_id not in gen_strings: -# gen_strings[request_id] = new_text -# else: -# gen_strings[request_id] += new_text - -# # Confirmed tracked values matches what we expected. -# for idx, (ref_gen_str, -# stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): - -# # Request should be aborted. -# request_id = f"request-{idx}" -# assert request_id in aborted - -# # Collected values that were generated. -# gen_str = gen_strings[request_id] - -# # Construct reference strings. -# stop_str_idx = ref_gen_str.find(stop_str) -# ref_str_exc_stop = ref_gen_str[:stop_str_idx] -# ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - -# if include_stop_str_in_output: -# assert gen_str == ref_str_inc_stop, ( -# f"{gen_str=}, {ref_str_inc_stop=}") -# else: -# assert gen_str == ref_str_exc_stop, ( -# f"{gen_str=}, {ref_str_exc_stop=}") - -# assert detokenizer.get_num_unfinished_requests() == 0 -# assert not detokenizer.has_unfinished_requests() +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__( + self, + generated_tokens_list: List[List[int]], + prompt_tokens_list: List[List[int]], + generated_logprobs_raw: Optional[List[List[Tuple[npt.NDArray, + npt.NDArray]]]], + prompt_logprobs_raw: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], + ) -> None: + self.generated_tokens_list = generated_tokens_list + self.prompt_tokens_list = prompt_tokens_list + self.current_idx = 0 + self.generated_logprobs_raw = generated_logprobs_raw + self.do_logprobs = generated_logprobs_raw is not None + self.prompt_logprobs_raw = prompt_logprobs_raw + self.do_prompt_logprobs = prompt_logprobs_raw is not None + + def get_outputs(self) -> List[EngineCoreOutput]: + do_logprobs = self.do_logprobs + do_prompt_logprobs = self.do_prompt_logprobs + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, (generated_token_ids, prompt_token_ids) in enumerate( + zip(self.generated_tokens_list, self.prompt_tokens_list)): + if len(generated_token_ids) > token_idx: + if do_logprobs: + assert self.generated_logprobs_raw is not None + logprobs = [ + self.generated_logprobs_raw[req_idx][token_idx] + ] + else: + logprobs = None + if self.current_idx == 0 and do_prompt_logprobs: + assert self.prompt_logprobs_raw is not None + prompt_logprobs = self.prompt_logprobs_raw[req_idx][0] + prompt_logprobs_token_ids = self.prompt_logprobs_raw[ + req_idx][1] + else: + prompt_logprobs = None + prompt_logprobs_token_ids = None + output = EngineCoreOutput( + request_id=f"request-{req_idx}", + new_token_ids=[generated_token_ids[token_idx]], + finished=False, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + prompt_logprobs_token_ids=prompt_logprobs_token_ids + if self.current_idx == 0 and do_prompt_logprobs else None, + ) + if token_idx == len(generated_token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def test_incremental_detokenization(request_output_kind: RequestOutputKind): + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = 
MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(GENERATION_STRINGS, GENERATION_TOKENS)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +@pytest.mark.parametrize("logprobs,prompt_logprobs", + [(None, None), (NUM_SAMPLE_LOGPROBS, None), + (None, NUM_PROMPT_LOGPROBS), + (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) +def test_stop_string( + include_stop_str_in_output: bool, + logprobs: Optional[int], + prompt_logprobs: Optional[int], +) -> None: + do_generated_logprobs = logprobs is not None + do_prompt_logprobs = prompt_logprobs is not None + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, + prompt_tokens_list=PROMPT_TOKENS, + generated_logprobs_raw=GENERATION_LOGPROBS_RAW + if do_generated_logprobs else None, + prompt_logprobs_raw=PROMPT_LOGPROBS_RAW + if do_prompt_logprobs else None) + + # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. 
+ for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + if request_id not in gen_strings: + gen_strings[request_id] = new_text + else: + gen_strings[request_id] += new_text + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, + stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. + stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 33d546ee060dc..ac59df5ebde05 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -123,8 +123,8 @@ def _detokenize_ids( self, token_id_list: int, ) -> List[str]: - return self.tokenizer.convert_ids_to_tokens( - token_id_list, skip_special_tokens=False) + return self.tokenizer.convert_ids_to_tokens(token_id_list, + skip_special_tokens=False) def _pythonize_sequence_position( self, From 58bcc5a45c651809f50dcefb206b5403123d7804 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 02:43:55 +0000 Subject: [PATCH 142/293] detokenizer stop tokens test passing; some slight engine fixes for the delta case Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 121 +++++++++++----------------- vllm/v1/engine/detokenizer.py | 28 ++++--- 2 files changed, 66 insertions(+), 83 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 604350b693417..2a1f665c5323f 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,5 +1,5 @@ import random -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple import numpy as np import numpy.typing as npt @@ -7,8 +7,8 @@ from transformers import AutoTokenizer from transformers.tokenization_utils import PreTrainedTokenizer +from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind -from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.v1.engine import EngineCoreOutput from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest @@ -99,66 +99,35 @@ def _generate_dummy_prompt_logprobs( False)) -def _pythonize_logprobs_at_single_seq_offset( - logprobs_np: npt.NDArray, - token_ids_np: npt.NDArray, +def _decode_token( + tok_id: int, tokenizer: PreTrainedTokenizer, -) -> Dict[int, Logprob]: - return { - tok_id: Logprob( - val, tdx + 1, - tokenizer.convert_ids_to_tokens([tok_id], - skip_special_tokens=False)) - for tdx, (val, tok_id) in enumerate(zip(logprobs_np, token_ids_np)) - } - - -def _detokenize_prompt_logprobs( - prompt_logprobs_np: Tuple[npt.NDArray, npt.NDArray], - tokenizer: PreTrainedTokenizer, -) -> PromptLogprobs: - prompt_logprobs_np_vals = prompt_logprobs_np[0] - prompt_logprobs_np_toks = 
prompt_logprobs_np[1] - num_prompt_tokens = prompt_logprobs_np_vals.shape[0] - res = [ - _pythonize_logprobs_at_single_seq_offset( - prompt_logprobs_np_vals[sdx, :], prompt_logprobs_np_toks[sdx, :], - tokenizer) for sdx in range(num_prompt_tokens) - ] - return res - - -def _copy_logprob_add_decode( - logprob: Logprob, - token_id: int, - tokenizer: PreTrainedTokenizer, -) -> Logprob: - return Logprob( - logprob.logprob, logprob.rank, - tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)) - - -def _generate_dummy_logprobs_tuple( - num_logprobs: int, - is_sample_logprobs: bool, - tokenizer: PreTrainedTokenizer, -) -> Dict[int, Logprob]: - adjusted_num_logprobs = (num_logprobs + random.choice([0, 1]) - if is_sample_logprobs else num_logprobs) - return { - random.randint(0, - len(tokenizer.vocab) - 1): - Logprob(random.uniform(-100, 0), idx, None) - for idx in range(adjusted_num_logprobs) - } - - -def _new_logprobs_detokenized(logprobs: Union[SampleLogprobs, PromptLogprobs], - C) -> Union[SampleLogprobs, PromptLogprobs]: - return [{ - tok_id: _copy_logprob_add_decode(lp, tok_id, tokenizer) - for tok_id, lp in lp_dict.items() - } for lp_dict in logprobs] +) -> str: + return tokenizer.convert_ids_to_tokens([tok_id], + skip_special_tokens=False)[0] + + +def _validate_requests_logprobs(requests: List[DetokenizerRequest], + request_outputs: List[RequestOutput]): + # Validate logprob detokenization + for req, req_out in zip(requests, request_outputs): + if req.logprobs is not None and req.logprobs > 0: + for comp in req_out.outputs: + for lp_dict in comp.logprobs: + for tok_id, lp in lp_dict.items(): + assert lp.decoded_token == _decode_token( + tok_id, + tokenizer), "sample logprob decoded token mismatch" + + if req.prompt_logprobs is not None and req.prompt_logprobs > 0 and len( + req_out.prompt_logprobs) > 0: + # Validate prompt logprobs + assert req_out.prompt_logprobs[0] is None + for plp_dict in req_out.prompt_logprobs[1:]: + for tok_id, plp in plp_dict.items(): + assert plp.decoded_token == _decode_token( + tok_id, + tokenizer), "prompt logprob decoded token mismatch" FULL_STRINGS = [ @@ -232,11 +201,10 @@ def get_outputs(self) -> List[EngineCoreOutput]: do_logprobs = self.do_logprobs do_prompt_logprobs = self.do_prompt_logprobs token_idx = self.current_idx - self.current_idx += 1 outputs = [] - for req_idx, (generated_token_ids, prompt_token_ids) in enumerate( - zip(self.generated_tokens_list, self.prompt_tokens_list)): + for req_idx, generated_token_ids in enumerate( + self.generated_tokens_list): if len(generated_token_ids) > token_idx: if do_logprobs: assert self.generated_logprobs_raw is not None @@ -245,28 +213,30 @@ def get_outputs(self) -> List[EngineCoreOutput]: ] else: logprobs = None - if self.current_idx == 0 and do_prompt_logprobs: - assert self.prompt_logprobs_raw is not None - prompt_logprobs = self.prompt_logprobs_raw[req_idx][0] - prompt_logprobs_token_ids = self.prompt_logprobs_raw[ - req_idx][1] + if do_prompt_logprobs: + if self.current_idx == 0: + assert self.prompt_logprobs_raw is not None + prompt_logprobs = self.prompt_logprobs_raw[req_idx][0] + prompt_logprobs_token_ids = self.prompt_logprobs_raw[ + req_idx][1] + else: + (prompt_logprobs, prompt_logprobs_token_ids) = ([], []) else: - prompt_logprobs = None - prompt_logprobs_token_ids = None + (prompt_logprobs, prompt_logprobs_token_ids) = (None, None) output = EngineCoreOutput( request_id=f"request-{req_idx}", new_token_ids=[generated_token_ids[token_idx]], finished=False, logprobs=logprobs, 
prompt_logprobs=prompt_logprobs, - prompt_logprobs_token_ids=prompt_logprobs_token_ids - if self.current_idx == 0 and do_prompt_logprobs else None, + prompt_logprobs_token_ids=prompt_logprobs_token_ids, ) if token_idx == len(generated_token_ids) - 1: output.finished = True output.finish_reason = "stopped" outputs.append(output) + self.current_idx += 1 return outputs @@ -378,6 +348,7 @@ def test_stop_string( gen_strings = {} aborted = [] + i = 0 while True: # Mock output from the EngineCore. outputs = engine_core.get_outputs() @@ -391,6 +362,9 @@ def test_stop_string( assert request_output.request_id not in aborted aborted.extend(requests_to_abort) + # Validate logprob detokenization + _validate_requests_logprobs(requests, request_outputs) + # Update tracking. for request_output in request_outputs: if request_output.finished: @@ -402,6 +376,7 @@ def test_stop_string( gen_strings[request_id] = new_text else: gen_strings[request_id] += new_text + i += 1 # Confirmed tracked values matches what we expected. for idx, (ref_gen_str, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ac59df5ebde05..0029c194efb0b 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -161,7 +161,7 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( new_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], new_token_ids: List[int], detokenize: bool, - ) -> Tuple[SampleLogprobs, float]: + ) -> SampleLogprobs: """Pythonize sample logprobs, maybe detokenize. Pythonization entails the conversion from a numpy (np) @@ -181,6 +181,7 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( Returns: Sample logprobs, Pythonized and possibly detokenized """ + new_pythonized_logprobs = [] max_logprobs = self.max_request_sample_logprobs for (logprob_values, logprob_token_ids), token_id in zip(new_logprobs, new_token_ids): @@ -201,12 +202,15 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( # There will be one more logprob than the user requested logprob_cnt = max_logprobs + 1 - new_pythonized_logprobs = self._pythonize_sequence_position( + new_pythonized_logprobs_dict = self._pythonize_sequence_position( logprob_values[0:logprob_cnt], logprob_token_ids[0:logprob_cnt], detokenize) - self.request_logprobs.append(new_pythonized_logprobs) - self.request_cumulative_logprob += new_pythonized_logprobs[ + self.request_logprobs.append(new_pythonized_logprobs_dict) + self.request_cumulative_logprob += new_pythonized_logprobs_dict[ token_id].logprob + new_pythonized_logprobs.append(new_pythonized_logprobs_dict) + + return new_pythonized_logprobs def _pythonize_maybe_detokenize_prompt_logprobs_for_request( self, @@ -237,6 +241,8 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( self.request_prompt_logprobs.extend(prompt_logprobs) + return prompt_logprobs + def add_tokens( self, new_token_ids: List[int], @@ -265,15 +271,17 @@ def add_tokens( # 1) If required, Pythonize & detokenize sample logprobs if do_request_sample_logprobs: - self._pythonize_maybe_detokenize_sample_logprobs_for_request( - new_logprobs, new_token_ids, detokenize=True) + new_logprobs = ( + self._pythonize_maybe_detokenize_sample_logprobs_for_request( + new_logprobs, new_token_ids, detokenize=True)) # 2) If necessary, detokenize prompt logprobs incrementally if do_request_prompt_logprobs: - self._pythonize_maybe_detokenize_prompt_logprobs_for_request( - new_prompt_logprobs, - new_prompt_logprob_token_ids, - detokenize=True) + new_prompt_logprobs = ( + 
self._pythonize_maybe_detokenize_prompt_logprobs_for_request( + new_prompt_logprobs, + new_prompt_logprob_token_ids, + detokenize=True)) # 3) Detokenize the new token ids incrementally. If necessary, # detokenize logprobs. From 63208681b96f4b15435a106f59ea8045750b0989 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 03:18:10 +0000 Subject: [PATCH 143/293] refactored detokenizer Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 136 ++++++++++++++++++++++++++-------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0029c194efb0b..89ffa0dac21d4 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Tuple import numpy as np import numpy.typing as npt @@ -16,11 +16,14 @@ logger = init_logger(__name__) -AnyLogprobs = Union[Optional[SampleLogprobs], Optional[PromptLogprobs]] - @dataclass class IncrementalDetokenizer: + """Track and implement detokenization for a single request. + + Also handles Pythonization (conversion to OpenAI-API-compatible Python + data structures) of logprobs Numpy arrays computed for the request. + """ # Generation data output_text: str @@ -63,6 +66,7 @@ class IncrementalDetokenizer: @property def output_token_ids(self) -> List[int]: + """Return generated tokens""" assert len(self.token_ids) >= len(self.prompt_token_ids) return self.token_ids[len(self.prompt_token_ids):] @@ -72,6 +76,15 @@ def from_new_request( tokenizer: AnyTokenizer, request: DetokenizerRequest, ) -> "IncrementalDetokenizer": + """Construct incremental detokenizer for a request. + + Args: + tokenizer: tokenizer provides detokenization methods + request: track detokenization progress of this request + + Returns: + Incremental detokenizer for the request + """ tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( tokenizer=tokenizer, @@ -87,10 +100,10 @@ def from_new_request( else: stop_buffer_length = 0 - # Logprobs & prompt logprobs settings + # Flags for whether to detokenize sample logprobs and prompt logprobs, + # respectively. do_request_logprobs = (request.logprobs is not None and request.logprobs > 0) - do_request_prompt_logprobs = (request.prompt_logprobs is not None and request.prompt_logprobs > 0) @@ -123,6 +136,14 @@ def _detokenize_ids( self, token_id_list: int, ) -> List[str]: + """Helper method to detokenize one or more token ids. + + Args: + token_id_list: list of tokens to detokenize + + Returns: + List of token string representations of tokens + """ return self.tokenizer.convert_ids_to_tokens(token_id_list, skip_special_tokens=False) @@ -134,13 +155,15 @@ def _pythonize_sequence_position( ) -> Dict[int, Logprob]: """Pythonize the numpy (np) logprobs & token ids for a sequence position - Optionally detokenize (compute logprob decoded token str) + Outputs the OpenAI-API-compatible representation of the top tokens and + their logprobs at a single position in a sequence. 
+ + Optionally detokenize (compute logprob `decoded_token`) Args: logprob_values: np logprob values logprob_token_ids: np logprob token ids - detokenize: if True, compute logprob decoded token str, - (o/w decoded_token=None) + detokenize: if True, detokenize logprob top token ids Return: mapping from top token id to Logprob data structure @@ -164,6 +187,12 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( ) -> SampleLogprobs: """Pythonize sample logprobs, maybe detokenize. + Only Pythonizes sample logprobs computed in the current + step. Has the side effect of updating the incremental detokenizer + state by (1) appending the new sample logprobs to the list of what + was computed for previously-sampled tokens, and (2) accumulating + into the request's cumulative logprob value.ß + Pythonization entails the conversion from a numpy (np) values/token ids representation to the more idiomatically Pythonic representation required by the OpenAI API, @@ -172,14 +201,14 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( The Logprob.decoded_token field is only computed (detokenized from the associated top token id) if detokenize=True - Also computes cumulative logprob. - Args: new_logprobs: List of (logprobs,logprob token ids) numpy array tuples + new_token_ids: List of sample token ids detokenize: Logprob.decoded_token is computed if True, otherwise None Returns: - Sample logprobs, Pythonized and possibly detokenized + Sample logprobs compute in this step, Pythonized and possibly + detokenized """ new_pythonized_logprobs = [] max_logprobs = self.max_request_sample_logprobs @@ -190,7 +219,7 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( logprob_cnt = max_logprobs if token_id not in logprob_token_ids[0:logprob_cnt]: # Sampled token is not in the in the top logprobs; - # inject it & resort, ensuring that excess logprobs + # inject it & re-sort, ensuring that excess logprobs # not requested by the user have -inf probability logprob_values[max_logprobs:-1] = float('-inf') # Get indices that would sort logprob_values in descending order @@ -202,6 +231,7 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( # There will be one more logprob than the user requested logprob_cnt = max_logprobs + 1 + # Pythonize top logprobs new_pythonized_logprobs_dict = self._pythonize_sequence_position( logprob_values[0:logprob_cnt], logprob_token_ids[0:logprob_cnt], detokenize) @@ -218,13 +248,33 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( prompt_logprob_token_ids: Optional[npt.NDArray], detokenize: bool, ) -> PromptLogprobs: - # Construct prompt logprobs, under the condition that - # prompt logprobs were requested & a nonzero number of - # prompt tokens were computed in this step for this request. - # - # Note that this scenario returns an EngineCoreOutput which - # is empty except for the prompt logprobs which were - # computed for these prompt tokens. + """Pythonize prompt logprobs, maybe detokenize. + + Only Pythonizes prompt logprobs computed in the current + step. Has the side effect of updating the incremental detokenizer + state by appending the new prompt logprobs to the list of what + was computed for previous prompt chunks. Forces the first prompt + logprob associated with the request to be `None`. 
+ + Pythonization entails the conversion from a numpy (np) + values/token ids representation to the more idiomatically + Pythonic representation required by the OpenAI API, + List[Dict[int,Logprob]] + + The Logprob.decoded_token field is only computed (detokenized + from the associated top token id) if detokenize=True + + Args: + prompt_logprob_values: num_chunk_tokens x num_prompt_logprobs np array + of top token log probabilities + prompt_logprob_token_ids: num_chunk_tokens x num_prompt_logprobs np + array of top token ids + detokenize: Logprob.decoded_token is computed if True, otherwise None + + Returns: + Prompt logprobs compute in this step, Pythonized and possibly + detokenized + """ logprob_cnt = self.max_request_prompt_logprobs prompt_logprobs = [ self._pythonize_sequence_position(plp_tok_values, @@ -234,13 +284,10 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( prompt_logprob_values[:, 0:logprob_cnt], prompt_logprob_token_ids[:, 0:logprob_cnt]) ] - if not self.request_prompt_logprobs: # Ensure that None is the first prompt logprob prompt_logprobs = [None] + prompt_logprobs - self.request_prompt_logprobs.extend(prompt_logprobs) - return prompt_logprobs def add_tokens( @@ -252,31 +299,51 @@ def add_tokens( finish_reason: Optional[str], stop_reason: Optional[str], ) -> Optional[RequestOutput]: - """ - Update RequestState for the request_id by: - 1) If necessary, detokenize logprobs *non*-incrementally - 2) If necessary, detokenize prompt logprobs *non*-incrementally - 3) Detokenize the new token ids incrementally. - 4) Update the RequestOutput with the new text. + """Update RequestState for the request_id. + + 1) If necessary, detokenize sample logprobs *non*-incrementally + 2) If necessary, detokenize prompt logprobs *non*-incrementally + 3) Detokenize the new token ids incrementally. + 4) Evaluate stop criteria + 5) Update the `RequestOutput` object with new text + + Args: + new_token_ids: list of newly-sampled token ids + new_logprobs: list of (logprobs,token ids) top logprobs + tuples for sampled tokens + new_prompt_logprobs: num_chunk_tokens x num_prompt_logprobs np array + of prompt logprobs values + new_prompt_logprob_token_ids: num_chunk_tokens x num_prompt_logprobs + np array of top token ids + finish_reason: string representation of the reason request + detokenization completed + stop_reason: reason that detokenization stopped + + Returns: + Returns request output instance, except i.e. when the request + is configured to only return a result on the final decode step + which has not occurred yet. 
""" + # Only try to Pythonize sample logprobs if any were provided do_request_sample_logprobs = new_logprobs is not None and len( new_logprobs) > 0 assert not do_request_sample_logprobs or len(new_logprobs) == len( new_token_ids) + # Only try to Pythonize prompt logprobs if any were provided do_request_prompt_logprobs = new_prompt_logprobs is not None and len( new_prompt_logprobs) > 0 assert (not do_request_prompt_logprobs or new_prompt_logprob_token_ids is not None) - # 1) If required, Pythonize & detokenize sample logprobs if do_request_sample_logprobs: + # 1) Pythonize & detokenize sample logprobs new_logprobs = ( self._pythonize_maybe_detokenize_sample_logprobs_for_request( new_logprobs, new_token_ids, detokenize=True)) - # 2) If necessary, detokenize prompt logprobs incrementally if do_request_prompt_logprobs: + # 2) If necessary, detokenize prompt logprobs incrementally new_prompt_logprobs = ( self._pythonize_maybe_detokenize_prompt_logprobs_for_request( new_prompt_logprobs, @@ -309,8 +376,8 @@ def add_tokens( decoded_text += new_decoded_token_text - # 2) Evaluate stop criteria. if self.stop: + # 4) Evaluate stop criteria. stop = StopChecker.check_stop_strings( output_text=self.output_text, new_char_count=len(decoded_text), @@ -325,7 +392,7 @@ def add_tokens( # TODO: handle stop_token_ids here too? - # 3) Update the RequestOutput object with the new text. + # 5) Update the RequestOutput object with the new text. finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: @@ -333,6 +400,10 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) + # DELTA -> new sampled tokens and logprobs + current cumulative prompt + # logprob + # FINAL -> all sampled tokens and logprobs + current cumulative prompt + # logprob token_ids = new_token_ids if delta else self.output_token_ids logprobs = new_logprobs if delta else self.request_logprobs prompt_logprobs = (new_prompt_logprobs @@ -376,6 +447,7 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: class Detokenizer: + """Track and implement detokenization of multiple requests""" def __init__(self, tokenizer_name: str, From 54abd99693e60602e68cec9681540b517b32daaa Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 03:38:24 +0000 Subject: [PATCH 144/293] wip Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 2a1f665c5323f..67a2205131dc4 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -243,9 +243,24 @@ def get_outputs(self) -> List[EngineCoreOutput]: @pytest.mark.parametrize( "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind): +@pytest.mark.parametrize("logprobs,prompt_logprobs", + [(None, None), (NUM_SAMPLE_LOGPROBS, None), + (None, NUM_PROMPT_LOGPROBS), + (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) +def test_incremental_detokenization( + request_output_kind: RequestOutputKind, + logprobs: Optional[int], + prompt_logprobs: Optional[int], +) -> None: + do_generated_logprobs = logprobs is not None + do_prompt_logprobs = prompt_logprobs is not None detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(GENERATION_TOKENS) + engine_core = 
MockEngineCore(generated_tokens_list=GENERATION_TOKENS, + prompt_tokens_list=PROMPT_TOKENS, + generated_logprobs_raw=GENERATION_LOGPROBS_RAW + if do_generated_logprobs else None, + prompt_logprobs_raw=PROMPT_LOGPROBS_RAW + if do_prompt_logprobs else None) # Make N requests. requests = [ From 7852bb2f8a82c6ae6c772be8d10842bfd5b8fe6e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 03:41:00 +0000 Subject: [PATCH 145/293] incremental detokenization test now also checks logprobs Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 67a2205131dc4..fb28442d3e798 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -273,6 +273,8 @@ def test_incremental_detokenization( output_kind=request_output_kind, stop=[], include_stop_str_in_output=False, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, ) for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) @@ -294,6 +296,9 @@ def test_incremental_detokenization( request_outputs, requests_to_abort = detokenizer.step(outputs) assert len(requests_to_abort) == 0 + # Validate logprob detokenization + _validate_requests_logprobs(requests, request_outputs) + # Update tracking. for request_output in request_outputs: request_id = request_output.request_id From f6d4329ce85ca564762adb1d506d271160e587aa Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 12:09:04 +0000 Subject: [PATCH 146/293] woosuk code structure suggestion Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_model_runner.py | 60 ++++++++++++------------------ 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fac8688327f8e..5766448312cbe 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -213,7 +213,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - sampling_metadata: SamplingMetadata, ) -> Tuple[torch.Tensor, FlashAttentionMetadata]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -298,7 +297,14 @@ def _prepare_inputs( out=slot_mapping) # Prepare the attention metadata. 
- query_start_loc = sampling_metadata.query_start_loc + query_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) + seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + num_scheduled_tokens) max_seq_len = seq_lens.max() @@ -313,6 +319,7 @@ def _prepare_inputs( input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) + query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() attn_metadata = FlashAttentionMetadata( @@ -334,6 +341,7 @@ def _prepare_sampling( self, scheduler_output: "SchedulerOutput", num_input_tokens: int, + query_start_loc: torch.Tensor, ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids @@ -346,6 +354,7 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata( scheduler_output, num_input_tokens, + query_start_loc, skip_copy, ) return sampling_metadata @@ -433,6 +442,11 @@ def execute_model( self._execute_encoder(scheduler_output) encoder_outputs = self._gather_encoder_outputs(scheduler_output) + # Prepare the decoder inputs. + ( + input_ids, + attn_metadata, + ) = self._prepare_inputs(scheduler_output=scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -444,18 +458,11 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens - sampling_metadata = self._prepare_sampling(scheduler_output, - num_input_tokens) + sampling_metadata = self._prepare_sampling( + scheduler_output, num_input_tokens, attn_metadata.query_start_loc) do_logprobs = sampling_metadata.max_num_logprobs > 0 do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - # Prepare the decoder inputs. - ( - input_ids, - attn_metadata, - ) = self._prepare_inputs(scheduler_output=scheduler_output, - sampling_metadata=sampling_metadata) - # Get the inputs embeds. if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( @@ -486,7 +493,7 @@ def execute_model( sampling_metadata=sampling_metadata, ) - # NOTE: CPU-GPU synchronization happens here. + # NOTE: sampled token id CPU-GPU synchronization happens here. sampled_token_ids = sampler_output.sampled_token_ids.cpu() sampled_token_ids_list = sampled_token_ids.tolist() # TODO(woosuk): The following loop can be slow since it iterates over @@ -514,6 +521,8 @@ def execute_model( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, + # NOTE: sample and prompt logprob CPU-GPU synchronization happens + # here logprob_token_ids_cpu=( sampler_output.logprob_token_ids.cpu().numpy() if do_logprobs else None), @@ -838,6 +847,7 @@ def make_sampling_metadata( self, scheduler_output: "SchedulerOutput", num_input_tokens: int, + query_start_loc: torch.Tensor, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -850,31 +860,6 @@ def make_sampling_metadata( num_reqs = self.num_reqs - # Get the number of scheduled tokens for each request. - # TODO: The Python loop can be slow. Optimize. 
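For readers of this hunk: the new query_start_loc is simply an exclusive prefix sum over the per-request scheduled-token counts. A minimal standalone sketch of the same pattern, with batch sizes invented for illustration:

import numpy as np
import torch

# Per-request scheduled token counts for a hypothetical 3-request batch.
num_scheduled_tokens = np.array([4, 1, 3], dtype=np.int32)
num_reqs = len(num_scheduled_tokens)

# Element 0 stays 0, the rest is the running total, so request i's query
# tokens occupy [query_start_loc[i], query_start_loc[i+1]) in the
# flattened token batch.
query_start_loc = torch.zeros(num_reqs + 1, dtype=torch.int32)
query_start_loc_np = query_start_loc.numpy()
np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:])

print(query_start_loc)  # tensor([0, 4, 5, 8], dtype=torch.int32)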
- num_scheduled_tokens = [] - max_num_scheduled_tokens = 0 - for req_id in self.req_ids[:num_reqs]: - num_tokens = scheduler_output.num_scheduled_tokens[req_id] - num_scheduled_tokens.append(num_tokens) - max_num_scheduled_tokens = max(max_num_scheduled_tokens, - num_tokens) - num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32) - assert max_num_scheduled_tokens > 0 - - # Compute query start offsets. It makes sense to compute this here - # rather than in model runner _prepare_inputs() because query start - # offsets are required for computing num_query_tokens in the scenario - # where prompt logprobs are required by the batch. - query_start_loc = torch.empty((num_reqs + 1, ), - dtype=torch.int32, - device="cpu", - pin_memory=self.pin_memory) - query_start_loc_np = query_start_loc.numpy() - query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) - return SamplingMetadata( temperature=self.temperature[:num_reqs], all_greedy=self.all_greedy, @@ -886,6 +871,7 @@ def make_sampling_metadata( generators=self.generators, max_num_logprobs=self.max_num_logprobs, max_num_prompt_logprobs=self.max_num_prompt_logprobs, + # Required for sampling indices computation query_start_loc=query_start_loc, num_input_tokens=num_input_tokens, partial_req_index=scheduler_output.partial_req_index, From a4eb6bc431bb16cfb40e20cac3ab4997b9bd2ed1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 12:24:52 +0000 Subject: [PATCH 147/293] detokenizer tests refactor Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 127 ++-------------------------- tests/v1/engine/utils.py | 123 +++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 119 deletions(-) create mode 100644 tests/v1/engine/utils.py diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index fb28442d3e798..831963c4ec836 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,135 +1,24 @@ -import random from typing import List, Optional, Tuple -import numpy as np import numpy.typing as npt import pytest from transformers import AutoTokenizer -from transformers.tokenization_utils import PreTrainedTokenizer -from vllm.outputs import RequestOutput +from tests.v1.engine.utils import (_generate_dummy_prompt_logprobs, + _generate_dummy_sample_logprobs, + _validate_requests_logprobs) from vllm.sampling_params import RequestOutputKind from vllm.v1.engine import EngineCoreOutput from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest -random.seed(42) +# Number of sample logprobs to request when testing sample logprobs NUM_SAMPLE_LOGPROBS = 5 +# Number of prompt logprobs to request when testing prompt logprobs NUM_PROMPT_LOGPROBS = 7 - +# Use Mistral instruct tokenizer TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - -def _create_random_top_logprob_vector( - num_logprobs: int, - lower: float, - upper: float, -) -> npt.NDArray: - return np.random.rand(num_logprobs) * (upper - lower) + lower - - -def _create_random_top_logprob_matrix( - shape: Tuple, - lower: float, - upper: float, -) -> npt.NDArray: - return np.random.rand(*shape) * (upper - lower) + lower - - -def _create_random_top_token_vector( - num_logprobs: int, - lower: int, - upper: int, - sampled_token_ids: Optional[npt.NDArray], - adjust_num_logprobs: bool, -) -> npt.NDArray: - choice_list = 
list(range(lower, upper)) - res = np.random.choice(choice_list, (num_logprobs + - (1 if adjust_num_logprobs else 0), ), - replace=False) - if sampled_token_ids is not None: - res[-1] = sampled_token_ids - return res - - -def _create_random_top_token_matrix( - shape: Tuple, - lower: int, - upper: int, - sampled_token_ids: Optional[npt.NDArray], - adjust_num_logprobs: bool, -) -> npt.NDArray: - choice_list = list(range(lower, upper)) - res = np.random.choice(choice_list, (shape[0], shape[1] + - (1 if adjust_num_logprobs else 0)), - replace=False) - if sampled_token_ids is not None: - res[:, -1] = sampled_token_ids - return res - - -def _generate_dummy_sample_logprobs( - sampled_tokens_list: List, - num_logprobs: int, - tokenizer: PreTrainedTokenizer, -) -> List[Tuple[npt.NDArray, npt.NDArray]]: - res = [] - for sampled_token_id in sampled_tokens_list: - num_logprobs_adjustment = random.choice([0, 1]) - res.append( - (_create_random_top_logprob_vector( - num_logprobs + num_logprobs_adjustment, -100, 0), - _create_random_top_token_vector(num_logprobs, 0, - len(tokenizer.vocab) - 1, - np.array([sampled_token_id]), - num_logprobs_adjustment > 0))) - return res - - -def _generate_dummy_prompt_logprobs( - tokens_list: List, - num_logprobs: int, - tokenizer: PreTrainedTokenizer, -) -> Tuple[npt.NDArray, npt.NDArray]: - num_tok = len(tokens_list) - return (_create_random_top_logprob_matrix((num_tok, num_logprobs), -100, - 0), - _create_random_top_token_matrix((num_tok, num_logprobs), 0, - len(tokenizer.vocab) - 1, None, - False)) - - -def _decode_token( - tok_id: int, - tokenizer: PreTrainedTokenizer, -) -> str: - return tokenizer.convert_ids_to_tokens([tok_id], - skip_special_tokens=False)[0] - - -def _validate_requests_logprobs(requests: List[DetokenizerRequest], - request_outputs: List[RequestOutput]): - # Validate logprob detokenization - for req, req_out in zip(requests, request_outputs): - if req.logprobs is not None and req.logprobs > 0: - for comp in req_out.outputs: - for lp_dict in comp.logprobs: - for tok_id, lp in lp_dict.items(): - assert lp.decoded_token == _decode_token( - tok_id, - tokenizer), "sample logprob decoded token mismatch" - - if req.prompt_logprobs is not None and req.prompt_logprobs > 0 and len( - req_out.prompt_logprobs) > 0: - # Validate prompt logprobs - assert req_out.prompt_logprobs[0] is None - for plp_dict in req_out.prompt_logprobs[1:]: - for tok_id, plp in plp_dict.items(): - assert plp.decoded_token == _decode_token( - tok_id, - tokenizer), "prompt logprob decoded token mismatch" - - FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", "Red Hat is the best open source company by far across Linux, K8s, and AI.", @@ -297,7 +186,7 @@ def test_incremental_detokenization( assert len(requests_to_abort) == 0 # Validate logprob detokenization - _validate_requests_logprobs(requests, request_outputs) + _validate_requests_logprobs(requests, request_outputs, tokenizer) # Update tracking. for request_output in request_outputs: @@ -383,7 +272,7 @@ def test_stop_string( aborted.extend(requests_to_abort) # Validate logprob detokenization - _validate_requests_logprobs(requests, request_outputs) + _validate_requests_logprobs(requests, request_outputs, tokenizer) # Update tracking. 
for request_output in request_outputs: diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py new file mode 100644 index 0000000000000..208b3d9d6df85 --- /dev/null +++ b/tests/v1/engine/utils.py @@ -0,0 +1,123 @@ +"""Engine test utils""" +import random +from typing import List, Optional, Tuple + +import numpy as np +import numpy.typing as npt +from transformers.tokenization_utils import PreTrainedTokenizer + +from vllm.outputs import RequestOutput +from vllm.v1.engine.detokenizer import DetokenizerRequest + +random.seed(42) + +def _create_random_top_logprob_vector( + num_logprobs: int, + lower: float, + upper: float, +) -> npt.NDArray: + return np.random.rand(num_logprobs) * (upper - lower) + lower + + +def _create_random_top_logprob_matrix( + shape: Tuple, + lower: float, + upper: float, +) -> npt.NDArray: + return np.random.rand(*shape) * (upper - lower) + lower + + +def _create_random_top_token_vector( + num_logprobs: int, + lower: int, + upper: int, + sampled_token_ids: Optional[npt.NDArray], + adjust_num_logprobs: bool, +) -> npt.NDArray: + choice_list = list(range(lower, upper)) + res = np.random.choice(choice_list, (num_logprobs + + (1 if adjust_num_logprobs else 0), ), + replace=False) + if sampled_token_ids is not None: + res[-1] = sampled_token_ids + return res + + +def _create_random_top_token_matrix( + shape: Tuple, + lower: int, + upper: int, + sampled_token_ids: Optional[npt.NDArray], + adjust_num_logprobs: bool, +) -> npt.NDArray: + choice_list = list(range(lower, upper)) + res = np.random.choice(choice_list, (shape[0], shape[1] + + (1 if adjust_num_logprobs else 0)), + replace=False) + if sampled_token_ids is not None: + res[:, -1] = sampled_token_ids + return res + + +def _generate_dummy_sample_logprobs( + sampled_tokens_list: List, + num_logprobs: int, + tokenizer: PreTrainedTokenizer, +) -> List[Tuple[npt.NDArray, npt.NDArray]]: + res = [] + for sampled_token_id in sampled_tokens_list: + num_logprobs_adjustment = random.choice([0, 1]) + res.append( + (_create_random_top_logprob_vector( + num_logprobs + num_logprobs_adjustment, -100, 0), + _create_random_top_token_vector(num_logprobs, 0, + len(tokenizer.vocab) - 1, + np.array([sampled_token_id]), + num_logprobs_adjustment > 0))) + return res + + +def _generate_dummy_prompt_logprobs( + tokens_list: List, + num_logprobs: int, + tokenizer: PreTrainedTokenizer, +) -> Tuple[npt.NDArray, npt.NDArray]: + num_tok = len(tokens_list) + return (_create_random_top_logprob_matrix((num_tok, num_logprobs), -100, + 0), + _create_random_top_token_matrix((num_tok, num_logprobs), 0, + len(tokenizer.vocab) - 1, None, + False)) + + +def _decode_token( + tok_id: int, + tokenizer: PreTrainedTokenizer, +) -> str: + return tokenizer.convert_ids_to_tokens([tok_id], + skip_special_tokens=False)[0] + + +def _validate_requests_logprobs(requests: List[DetokenizerRequest], + request_outputs: List[RequestOutput], + tokenizer: PreTrainedTokenizer, +) -> None: + # Validate logprob detokenization + for req, req_out in zip(requests, request_outputs): + if req.logprobs is not None and req.logprobs > 0: + for comp in req_out.outputs: + for lp_dict in comp.logprobs: + for tok_id, lp in lp_dict.items(): + assert lp.decoded_token == _decode_token( + tok_id, + tokenizer), "sample logprob decoded token mismatch" + + if req.prompt_logprobs is not None and req.prompt_logprobs > 0 and len( + req_out.prompt_logprobs) > 0: + # Validate prompt logprobs + assert req_out.prompt_logprobs[0] is None + for plp_dict in req_out.prompt_logprobs[1:]: + for tok_id, plp 
in plp_dict.items(): + assert plp.decoded_token == _decode_token( + tok_id, + tokenizer), "prompt logprob decoded token mismatch" \ No newline at end of file From 06185d0ae07590410b6dfd21278795755c6c18b6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 12:56:38 +0000 Subject: [PATCH 148/293] refactor Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 2 +- tests/v1/engine/utils.py | 208 +++++++++++++++++++++++----- 2 files changed, 174 insertions(+), 36 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 831963c4ec836..410a8fd1fd8f0 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -33,7 +33,7 @@ tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS ] PROMPT_LOGPROBS_RAW: List[Tuple[npt.NDArray, npt.NDArray]] = [ - _generate_dummy_prompt_logprobs(tokens_list=tokens_list, + _generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, num_logprobs=NUM_PROMPT_LOGPROBS, tokenizer=tokenizer) for tokens_list in PROMPT_TOKENS diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 208b3d9d6df85..f6d6888003a49 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -1,6 +1,6 @@ """Engine test utils""" import random -from typing import List, Optional, Tuple +from typing import List, Tuple import numpy as np import numpy.typing as npt @@ -11,51 +11,119 @@ random.seed(42) -def _create_random_top_logprob_vector( + +def _create_random_top_logprob_test_vector( num_logprobs: int, lower: float, upper: float, ) -> npt.NDArray: + """Create a random vector of top logprob float values. + + Use to create fake sample logprobs for testing. + + Note that a real production scenario would require + logprobs to be sorted in descending order, something + which is omitted in this function. + + Args: + num_logprobs: number of top logprobs + lower: lower range of logprob float values + upper: upper range of logprob float values + + Returns: + 1D length-`num_logprobs` np array of float logprob values + """ return np.random.rand(num_logprobs) * (upper - lower) + lower -def _create_random_top_logprob_matrix( +def _create_random_top_logprob_test_matrix( shape: Tuple, lower: float, upper: float, ) -> npt.NDArray: + """Create a random matrix of top logprob float values. + + Use to create fake prompt logprobs for testing. + + Note that a real production scenario would require + logprobs to be sorted in descending order along rows, + something which is omitted in this function. + + Args: + shape: (num_tokens,num_logprobs) tuple representing + matrix shape + lower: lower range of logprob float values + upper: upper range of logprob float values + + Returns: + 2D num_tokens x num_logprobs np array of float logprob values + """ return np.random.rand(*shape) * (upper - lower) + lower -def _create_random_top_token_vector( +def _create_random_top_token_test_vector( num_logprobs: int, lower: int, upper: int, - sampled_token_ids: Optional[npt.NDArray], + sampled_token_id: int, adjust_num_logprobs: bool, ) -> npt.NDArray: + """Create a random vector of top logprob token indices + + Use to create fake sample logprobs for testing. The sampled token + ID must always be one of the top logprobs, which this dummy test + vector generator enforces. 
OpenAI API + compatible engines must be able to return an addition sample + logprob for the sampled token if the sampled token was not + among the top sample logprobs; `adjust_num_logprobs` emulates + this behavior by increasing the vector length by 1 if + `adjust_num_logprobs` is set. + + Args: + num_logprobs: number of top logprobs + lower: lower range of token ids + upper: upper range of token ids + sampled_token_id: the token actually sampled + adjust_num_logprobs: if True, emulate situation where sampled + token logprob must be injected into top + logprobs + + Returns: + 1D length-x np array of token ids where x is + `num_logprobs+1` if `adjust_num_logprobs` and + `num_logprobs` otherwise + """ choice_list = list(range(lower, upper)) res = np.random.choice(choice_list, (num_logprobs + (1 if adjust_num_logprobs else 0), ), replace=False) - if sampled_token_ids is not None: - res[-1] = sampled_token_ids + res[-1] = sampled_token_id return res -def _create_random_top_token_matrix( +def _create_random_top_token_test_matrix( shape: Tuple, lower: int, upper: int, - sampled_token_ids: Optional[npt.NDArray], - adjust_num_logprobs: bool, ) -> npt.NDArray: + """Create a random matrix of top logprob token indices + + Use to create fake prompt logprobs for testing. + + Token ids are generated randomly and sampled without + replacement. + + Args: + shape: (num_tokens,num_logprobs) tuple representing + matrix shape + lower: lower range of token ids + upper: upper range of token ids + + Returns: + 2D num_tokens x num_logprobs np array of token ids + """ choice_list = list(range(lower, upper)) - res = np.random.choice(choice_list, (shape[0], shape[1] + - (1 if adjust_num_logprobs else 0)), - replace=False) - if sampled_token_ids is not None: - res[:, -1] = sampled_token_ids + res = np.random.choice(choice_list, (shape[0], shape[1]), replace=False) return res @@ -64,50 +132,114 @@ def _generate_dummy_sample_logprobs( num_logprobs: int, tokenizer: PreTrainedTokenizer, ) -> List[Tuple[npt.NDArray, npt.NDArray]]: + """Generate dummy sample logprobs + + Generate a test data structure which imitates the list of sample logprobs + which would be assembled in the engine core during decode phase. 
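A rough usage sketch of the helper documented here, using the private name it has at this point in the series; the tokenizer is the one this test module already uses and the token ids are arbitrary illustrative values.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3")

sampled_token_ids = [101, 2057, 318]  # arbitrary ids for illustration
dummy = _generate_dummy_sample_logprobs(
    sampled_tokens_list=sampled_token_ids,
    num_logprobs=5,
    tokenizer=tokenizer)

for (logprob_vec, token_id_vec), sampled in zip(dummy, sampled_token_ids):
    # Each pair has 5 or 6 entries (6 when the helper emulates injecting the
    # sampled token's logprob), and the sampled token id is always present
    # in the top-token vector.
    assert len(logprob_vec) == len(token_id_vec)
    assert len(token_id_vec) in (5, 6)
    assert sampled in token_id_vec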
+ + Args: + sampled_tokens_list: list of sampled tokens + num_logprobs: return `num_logprobs` or `num_logprobs+1` logprobs per token + tokenizer: model tokenizer to use for detokenization + + Returns + List of (logprobs vector, top token ids vector) np array tuples; each pair + of vectors have the same length which is either `num_logprobs` or + `num_logprobs+1` + """ res = [] for sampled_token_id in sampled_tokens_list: num_logprobs_adjustment = random.choice([0, 1]) - res.append( - (_create_random_top_logprob_vector( - num_logprobs + num_logprobs_adjustment, -100, 0), - _create_random_top_token_vector(num_logprobs, 0, - len(tokenizer.vocab) - 1, - np.array([sampled_token_id]), - num_logprobs_adjustment > 0))) + res.append((_create_random_top_logprob_test_vector( + num_logprobs + num_logprobs_adjustment, -100, 0), + _create_random_top_token_test_vector( + num_logprobs, 0, + len(tokenizer.vocab) - 1, sampled_token_id, + num_logprobs_adjustment > 0))) return res def _generate_dummy_prompt_logprobs( - tokens_list: List, + prompt_tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, ) -> Tuple[npt.NDArray, npt.NDArray]: - num_tok = len(tokens_list) - return (_create_random_top_logprob_matrix((num_tok, num_logprobs), -100, - 0), - _create_random_top_token_matrix((num_tok, num_logprobs), 0, - len(tokenizer.vocab) - 1, None, - False)) + """Generate dummy prompt logprobs + + Generate a test data structure which imitates the np arrays of prompt + logprobs which would be assembled in the engine core during chunked + prefill. + + Args: + prompt_tokens_list: list of prompt tokens + num_logprobs: return `num_logprobs` logprobs per token + tokenizer: model tokenizer to use for detokenization + + Returns + Single Tuple of (logprobs matrix, top token ids matrix) np arrays, + where both matrices have dimensions + num_prompt_tokens x num_logprobs + """ + num_prompt_tokens = len(prompt_tokens_list) + return (_create_random_top_logprob_test_matrix( + (num_prompt_tokens, num_logprobs), -100, 0), + _create_random_top_token_test_matrix( + (num_prompt_tokens, num_logprobs), 0, + len(tokenizer.vocab) - 1)) def _decode_token( tok_id: int, tokenizer: PreTrainedTokenizer, ) -> str: + """Reproduce the process of detokenizing a token for testing purposes. + + Args: + tok_id: token id to detokenize + tokenizer: tokenizer to use for detokenization + + Returns: + string representation of token + """ return tokenizer.convert_ids_to_tokens([tok_id], skip_special_tokens=False)[0] -def _validate_requests_logprobs(requests: List[DetokenizerRequest], - request_outputs: List[RequestOutput], - tokenizer: PreTrainedTokenizer, +def _validate_requests_logprobs( + requests: List[DetokenizerRequest], + request_outputs: List[RequestOutput], + tokenizer: PreTrainedTokenizer, ) -> None: - # Validate logprob detokenization + """Validate detokenizer logprobs output + + For each sample or prompt logprob, the logprob's + `decoded_token` member should match the result of + detokenizing the logprob's token id. + + Fails upon mismatch. + + Requires that `requests` and `request_outputs` have + the same ordering with respect to requests (i.e. + the data structure pertaining to a given request + id appears at the same index in both lists and + both lists have the same length.) 
+ + Args: + requests: list of detokenizer input requests + request_outputs: list of detokenizer outputs + """ + assert len(requests) == len(request_outputs) for req, req_out in zip(requests, request_outputs): if req.logprobs is not None and req.logprobs > 0: + # Validate sample logprobs for comp in req_out.outputs: + # For each completion for lp_dict in comp.logprobs: + # For each sampled token offset for tok_id, lp in lp_dict.items(): + # For each top logprob, + # compare each `decoded_token` to the result + # of decoding the logprob's token id assert lp.decoded_token == _decode_token( tok_id, tokenizer), "sample logprob decoded token mismatch" @@ -115,9 +247,15 @@ def _validate_requests_logprobs(requests: List[DetokenizerRequest], if req.prompt_logprobs is not None and req.prompt_logprobs > 0 and len( req_out.prompt_logprobs) > 0: # Validate prompt logprobs - assert req_out.prompt_logprobs[0] is None + assert req_out.prompt_logprobs[ + 0] is None # always true for prompt logprobs for plp_dict in req_out.prompt_logprobs[1:]: + # For each prompt token offset + assert plp_dict is not None for tok_id, plp in plp_dict.items(): + # For each top logprob, + # compare each `decoded_token` to the result + # of decoding the logprob's token id assert plp.decoded_token == _decode_token( tok_id, - tokenizer), "prompt logprob decoded token mismatch" \ No newline at end of file + tokenizer), "prompt logprob decoded token mismatch" From 90ed53d2aaa73ea58f42be7b7ceffd05dfd7fdf4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 13:14:45 +0000 Subject: [PATCH 149/293] refactoring Signed-off-by: Andrew Feldman --- tests/v1/samplers/test_logprobs.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 0533cac74acee..129feb7c7fe49 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -1,3 +1,4 @@ +import os import re from typing import List, Tuple @@ -114,6 +115,11 @@ def _compute_correct_cumulative_logprob( return sum([lp[tok_id].logprob for tok_id, lp in zip(token_ids, logprobs)]) +def _assert_vllm_use_v1(): + if os.getenv("VLLM_USE_V1") != "1": + raise OSError("Test requires VLLM_USE_V1=\"1\"") + + def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, @@ -125,10 +131,8 @@ def _test_case_get_logprobs_and_prompt_logprobs( example_prompts, monkeypatch, ) -> None: + _assert_vllm_use_v1() test_prompts = example_prompts - - # LLM engine v1 - monkeypatch.setenv("VLLM_USE_V1", "1") override_backend_env_variable(monkeypatch, "FLASH_ATTN") max_num_seqs = 128 @@ -342,6 +346,10 @@ def test_get_logprobs_and_prompt_logprobs( monkeypatch=monkeypatch) +# LLM engine v1 +@pytest.mark.skipif(os.getenv("VLLM_V1_FAST_TESTS") != "1", + reason="vLLM v1 fast tests not enabled by " + "VLLM_V1_FAST_TESTS=\"1\" in the environment.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) # needed for comparing logprobs with HF @@ -363,7 +371,6 @@ def test_fast_get_logprobs_and_prompt_logprobs( Faster version of `test_get_logprobs_and_prompt_logprobs` with fewer test cases. 
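For context, both guards above key off environment variables that are expected to be set before the test session starts; a minimal sanity check mirroring the source's own checks (shown only to make the expectation explicit):

import os

assert os.getenv("VLLM_USE_V1") == "1"         # required by _assert_vllm_use_v1
assert os.getenv("VLLM_V1_FAST_TESTS") == "1"  # opts in to the fast variant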
""" - _test_case_get_logprobs_and_prompt_logprobs( hf_runner=hf_runner, vllm_runner=vllm_runner, @@ -384,8 +391,7 @@ def test_max_logprobs(monkeypatch): Args: monkeypatch """ - # LLM engine v1 - monkeypatch.setenv("VLLM_USE_V1", "1") + _assert_vllm_use_v1() override_backend_env_variable(monkeypatch, "FLASH_ATTN") runner = VllmRunner("facebook/opt-125m", max_logprobs=1) @@ -408,9 +414,7 @@ def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): example_prompts monkeypatch """ - - # LLM engine v1 - monkeypatch.setenv("VLLM_USE_V1", "1") + _assert_vllm_use_v1() override_backend_env_variable(monkeypatch, "FLASH_ATTN") max_num_seqs = 256 From 48f46710fb8392091e757b89e164bc23604c944e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 13:30:00 +0000 Subject: [PATCH 150/293] refactor Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 22 ++--- tests/v1/engine/utils.py | 6 +- tests/v1/samplers/test_logprobs.py | 128 +++------------------------- tests/v1/samplers/utils.py | 110 ++++++++++++++++++++++++ tests/v1/utils.py | 6 ++ 5 files changed, 142 insertions(+), 130 deletions(-) create mode 100644 tests/v1/samplers/utils.py create mode 100644 tests/v1/utils.py diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 410a8fd1fd8f0..083236059b42f 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -4,9 +4,9 @@ import pytest from transformers import AutoTokenizer -from tests.v1.engine.utils import (_generate_dummy_prompt_logprobs, - _generate_dummy_sample_logprobs, - _validate_requests_logprobs) +from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, + generate_dummy_sample_logprobs, + validate_requests_logprobs) from vllm.sampling_params import RequestOutputKind from vllm.v1.engine import EngineCoreOutput from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest @@ -33,9 +33,9 @@ tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS ] PROMPT_LOGPROBS_RAW: List[Tuple[npt.NDArray, npt.NDArray]] = [ - _generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, - num_logprobs=NUM_PROMPT_LOGPROBS, - tokenizer=tokenizer) + generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS, + tokenizer=tokenizer) for tokens_list in PROMPT_TOKENS ] # PROMPT_LOGPROBS = [ @@ -46,9 +46,9 @@ tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS ] GENERATION_LOGPROBS_RAW = [ - _generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, - num_logprobs=NUM_SAMPLE_LOGPROBS, - tokenizer=tokenizer) + generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS, + tokenizer=tokenizer) for tokens_list in GENERATION_TOKENS ] # GENERATION_LOGPROBS = [ @@ -186,7 +186,7 @@ def test_incremental_detokenization( assert len(requests_to_abort) == 0 # Validate logprob detokenization - _validate_requests_logprobs(requests, request_outputs, tokenizer) + validate_requests_logprobs(requests, request_outputs, tokenizer) # Update tracking. for request_output in request_outputs: @@ -272,7 +272,7 @@ def test_stop_string( aborted.extend(requests_to_abort) # Validate logprob detokenization - _validate_requests_logprobs(requests, request_outputs, tokenizer) + validate_requests_logprobs(requests, request_outputs, tokenizer) # Update tracking. 
for request_output in request_outputs: diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index f6d6888003a49..986844c397926 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -127,7 +127,7 @@ def _create_random_top_token_test_matrix( return res -def _generate_dummy_sample_logprobs( +def generate_dummy_sample_logprobs( sampled_tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, @@ -159,7 +159,7 @@ def _generate_dummy_sample_logprobs( return res -def _generate_dummy_prompt_logprobs( +def generate_dummy_prompt_logprobs( prompt_tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, @@ -205,7 +205,7 @@ def _decode_token( skip_special_tokens=False)[0] -def _validate_requests_logprobs( +def validate_requests_logprobs( requests: List[DetokenizerRequest], request_outputs: List[RequestOutput], tokenizer: PreTrainedTokenizer, diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 129feb7c7fe49..48f3414b4e693 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -1,125 +1,21 @@ import os -import re -from typing import List, Tuple +from typing import List import pytest import torch from tests.kernels.utils import override_backend_env_variable -from vllm import CompletionOutput, SamplingParams +from tests.v1.samplers.utils import ( + assert_incr_detok_str_matches_non_incr_detok_str, + compute_correct_cumulative_logprob, get_test_batch) +from tests.v1.utils import assert_vllm_use_v1 +from vllm import SamplingParams from ...conftest import VllmRunner MODELS = ["facebook/opt-125m"] -def _get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: - """Generate logprobs configs for a batch of requests - - A given request's logprobs configuration is (1) num_sample_logprobs and (2) - num_prompt_logprobs. The batch logprobs configuration is the list of request - logprobs configs. 
- - batch_logprobs_composition == "NONE" yields a batch with no sample or prompt - logprobs - - batch_logprobs_composition == "SAMPLE" yields a batch with some requests - configured for sample logprobs only, and others configured for no logprobs - - batch_logprobs_composition == "PROMPT" yields a batch with some requests - configured for prompt logprobs only, and others configured for no logprobs - - batch_logprobs_composition == "SAMPLE_PROMPT" yields a batch with some - requests configured for sample logprobs and prompt logprobs, some configured - for only sample logprobs or only prompt logprobs, and some configured for - no logprobs - - Args: - batch_logprobs_composition: types of logprobs configs to include in batch - - Returns: - - List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) - tuples - """ - if batch_logprobs_composition == "NONE": - # No requests with sample or prompt logprobs - return [(None, None), (0, None), (None, 0), (0, 0)] - elif batch_logprobs_composition == "SAMPLE": - return [ - (None, None), - (None, 0), - (0, None), - (0, 0), - (5, None), - (3, 0), - ] - elif batch_logprobs_composition == "PROMPT": - return [ - (None, 0), - (0, None), - (0, 0), - (None, 6), - (0, 5), - ] - elif batch_logprobs_composition == "SAMPLE_PROMPT": - return [ - (None, 0), - (0, None), - (0, 0), - (5, None), - (3, 0), - (6, 3), - (None, 6), - (0, 5), - ] - else: - raise ValueError("Invalid logprobs batch configuration for test.") - - -def _assert_incr_detok_str_matches_non_incr_detok_str( - incremental_detokenization_str: str, - non_incremental_detokenization_str: str, - msg: str, -) -> None: - """Compare incrementally detok. text to non-incrementally detok. text - - Fail if the strings mismatch after non-alphanumeric characters are stripped - out. - - Rationale: incremental detokenization in the text generation process allows - the tokenizer to adjust the next token text output based on the token's - context in the string. However, logprobs detokenization detokenizes each - token individually, and the resultant strings may include some - non-alphanumeric placeholder characters where there could be i.e. - whitespace. So, this function compares only the alphanumeric text - between two strings and fails if there is a mismatch, which helps - with validating logprobs detokenization. 
- - Args: - incremental_detokenization_str: incrementally-detokenized generated text - non_incremental_detokenization_str: non-incrementally-detokenized logprob - tokens - msg: error message if `assert` fails - """ - rgx = r'[^a-zA-Z0-9]+' - assert (re.sub(rgx, '', incremental_detokenization_str) == re.sub( - rgx, '', non_incremental_detokenization_str)), (msg) - - -def _compute_correct_cumulative_logprob( - completion_output: CompletionOutput) -> float: - token_ids = completion_output.token_ids - logprobs = completion_output.logprobs - assert logprobs is not None - return sum([lp[tok_id].logprob for tok_id, lp in zip(token_ids, logprobs)]) - - -def _assert_vllm_use_v1(): - if os.getenv("VLLM_USE_V1") != "1": - raise OSError("Test requires VLLM_USE_V1=\"1\"") - - def _test_case_get_logprobs_and_prompt_logprobs( hf_runner, vllm_runner, @@ -131,7 +27,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( example_prompts, monkeypatch, ) -> None: - _assert_vllm_use_v1() + assert_vllm_use_v1() test_prompts = example_prompts override_backend_env_variable(monkeypatch, "FLASH_ATTN") @@ -152,7 +48,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( # Batch has mixed sample params # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = _get_test_batch(batch_logprobs_composition) + logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) # We rely on there being more prompts than combinations of # logprobs & prompt logprobs which we want to test @@ -223,7 +119,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( if detokenize: output_string_from_most_likely_tokens = "".join( output_string_from_most_likely_tokens_lst) - _assert_incr_detok_str_matches_non_incr_detok_str( + assert_incr_detok_str_matches_non_incr_detok_str( output_text, output_string_from_most_likely_tokens, "The output text from the top logprob for each token " "position should be the same as the output text in the " @@ -254,7 +150,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( # matches the correct value, which is computed below. torch.testing.assert_close( vllm_result.outputs[0].cumulative_logprob, - _compute_correct_cumulative_logprob(vllm_result.outputs[0]), + compute_correct_cumulative_logprob(vllm_result.outputs[0]), atol=1e-6, rtol=1e-6) else: @@ -391,7 +287,7 @@ def test_max_logprobs(monkeypatch): Args: monkeypatch """ - _assert_vllm_use_v1() + assert_vllm_use_v1() override_backend_env_variable(monkeypatch, "FLASH_ATTN") runner = VllmRunner("facebook/opt-125m", max_logprobs=1) @@ -414,7 +310,7 @@ def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): example_prompts monkeypatch """ - _assert_vllm_use_v1() + assert_vllm_use_v1() override_backend_env_variable(monkeypatch, "FLASH_ATTN") max_num_seqs = 256 diff --git a/tests/v1/samplers/utils.py b/tests/v1/samplers/utils.py new file mode 100644 index 0000000000000..4c9eae5e3f0a9 --- /dev/null +++ b/tests/v1/samplers/utils.py @@ -0,0 +1,110 @@ +"""Sampler testing utils""" +import re +from typing import List, Tuple +from vllm import CompletionOutput + +def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: + """Generate logprobs configs for a batch of requests + + A given request's logprobs configuration is (1) num_sample_logprobs and (2) + num_prompt_logprobs. The batch logprobs configuration is the list of request + logprobs configs. 
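A quick sanity sketch of one composition's return value, with the expected tuples taken from the SAMPLE_PROMPT branch of this function:

configs = get_test_batch("SAMPLE_PROMPT")
# Each entry is (num_sample_logprobs, num_prompt_logprobs) for one request,
# where None means that kind of logprob is not requested for the request.
assert (6, 3) in configs           # both sample and prompt logprobs
assert (None, None) not in configs
assert len(configs) == 8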
+ + batch_logprobs_composition == "NONE" yields a batch with no sample or prompt + logprobs + + batch_logprobs_composition == "SAMPLE" yields a batch with some requests + configured for sample logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "PROMPT" yields a batch with some requests + configured for prompt logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "SAMPLE_PROMPT" yields a batch with some + requests configured for sample logprobs and prompt logprobs, some configured + for only sample logprobs or only prompt logprobs, and some configured for + no logprobs + + Args: + batch_logprobs_composition: types of logprobs configs to include in batch + + Returns: + + List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + tuples + """ + if batch_logprobs_composition == "NONE": + # No requests with sample or prompt logprobs + return [(None, None), (0, None), (None, 0), (0, 0)] + elif batch_logprobs_composition == "SAMPLE": + # Requests requiring sample logprobs or no logprobs + return [ + (None, None), + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + ] + elif batch_logprobs_composition == "PROMPT": + # Requests requiring prompt logprobs or no logprobs + return [ + (None, 0), + (0, None), + (0, 0), + (None, 6), + (0, 5), + ] + elif batch_logprobs_composition == "SAMPLE_PROMPT": + # Requests requiring either no logprobs, just + # sample logprobs, just prompt logprobs, or + # both sample and prompt logprobs + return [ + (None, 0), + (0, None), + (0, 0), + (5, None), + (3, 0), + (6, 3), + (None, 6), + (0, 5), + ] + else: + raise ValueError("Invalid logprobs batch configuration for test.") + + +def assert_incr_detok_str_matches_non_incr_detok_str( + incremental_detokenization_str: str, + non_incremental_detokenization_str: str, + msg: str, +) -> None: + """Compare incrementally detok. text to non-incrementally detok. text + + Fail if the strings mismatch after non-alphanumeric characters are stripped + out. + + Rationale: incremental detokenization in the text generation process allows + the tokenizer to adjust the next token text output based on the token's + context in the string. However, logprobs detokenization detokenizes each + token individually, and the resultant strings may include some + non-alphanumeric placeholder characters where there could be i.e. + whitespace. So, this function compares only the alphanumeric text + between two strings and fails if there is a mismatch, which helps + with validating logprobs detokenization. 
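A tiny concrete example of the alphanumeric-only comparison described above; the strings are invented for illustration:

import re

incremental = "Hello, world! How are you?"
non_incremental = "Hello , world !How are you ?"  # placeholder spacing

rgx = r'[^a-zA-Z0-9]+'
assert re.sub(rgx, '', incremental) == re.sub(rgx, '', non_incremental)
# Both sides reduce to "HelloworldHowareyou", so the check tolerates the
# whitespace/punctuation differences that per-token detokenization produces.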
+ + Args: + incremental_detokenization_str: incrementally-detokenized generated text + non_incremental_detokenization_str: non-incrementally-detokenized logprob + tokens + msg: error message if `assert` fails + """ + rgx = r'[^a-zA-Z0-9]+' + assert (re.sub(rgx, '', incremental_detokenization_str) == re.sub( + rgx, '', non_incremental_detokenization_str)), (msg) + + +def compute_correct_cumulative_logprob( + completion_output: CompletionOutput) -> float: + token_ids = completion_output.token_ids + logprobs = completion_output.logprobs + assert logprobs is not None + return sum([lp[tok_id].logprob for tok_id, lp in zip(token_ids, logprobs)]) \ No newline at end of file diff --git a/tests/v1/utils.py b/tests/v1/utils.py new file mode 100644 index 0000000000000..52b9d3ada2c20 --- /dev/null +++ b/tests/v1/utils.py @@ -0,0 +1,6 @@ +"""V1 vLLM engine test utils""" +import os + +def assert_vllm_use_v1(): + if os.getenv("VLLM_USE_V1") != "1": + raise OSError("Test requires VLLM_USE_V1=\"1\"") \ No newline at end of file From 7121739f8f9962b24ad8fe4810c1d42f41e503c7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 14:11:17 +0000 Subject: [PATCH 151/293] refactoring to make logprobs var names clearer, touched a lot of files. Bugfix. --- examples/llm_engine_example.py | 4 +- examples/lora_with_quantization_inference.py | 16 ++-- examples/multilora_inference.py | 12 +-- tests/conftest.py | 14 ++-- tests/engine/test_skip_tokenizer_init.py | 3 +- .../decoder_only/language/test_mistral.py | 4 +- .../vision_language/test_pixtral.py | 4 +- tests/samplers/test_logits_processor.py | 4 +- tests/samplers/test_logprobs.py | 24 +++--- tests/samplers/test_ranks.py | 15 ++-- tests/samplers/test_sampler.py | 4 +- tests/spec_decode/e2e/conftest.py | 4 +- tests/spec_decode/e2e/test_logprobs.py | 2 +- tests/tokenization/test_detokenize.py | 8 +- tests/v1/engine/utils.py | 1 - tests/v1/samplers/test_logprobs.py | 17 ++-- tests/v1/samplers/utils.py | 12 ++- tests/v1/utils.py | 3 +- vllm/engine/llm_engine.py | 8 +- vllm/engine/protocol.py | 2 +- vllm/entrypoints/llm.py | 3 +- vllm/model_executor/layers/sampler.py | 17 ++-- vllm/model_executor/sampling_metadata.py | 13 +-- vllm/outputs.py | 2 +- vllm/sampling_params.py | 35 ++++---- vllm/spec_decode/spec_decode_worker.py | 5 +- vllm/spec_decode/util.py | 3 +- vllm/v1/engine/core.py | 38 +++++---- vllm/v1/engine/detokenizer.py | 40 +++++---- vllm/v1/engine/processor.py | 11 +-- vllm/v1/outputs.py | 16 ++-- vllm/v1/request.py | 16 ++-- vllm/v1/sample/metadata.py | 4 +- vllm/v1/sample/sampler.py | 83 +++++++++++-------- vllm/v1/worker/gpu_model_runner.py | 40 +++++---- vllm/worker/hpu_model_runner.py | 8 +- vllm/worker/model_runner.py | 8 +- vllm/worker/multi_step_model_runner.py | 8 +- vllm/worker/tpu_model_runner.py | 4 +- 39 files changed, 289 insertions(+), 226 deletions(-) diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index 60d894aae9692..dc87ef3df1ce2 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -9,7 +9,9 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: """Create a list of test prompts with their sampling parameters.""" return [ ("A robot may not injure a human being", - SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), + SamplingParams(temperature=0.0, + request_sample_logprobs=1, + request_prompt_logprobs=1)), ("To be or not to be,", SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), ("What is the meaning of life?", diff --git 
a/examples/lora_with_quantization_inference.py b/examples/lora_with_quantization_inference.py index 0c454ea50f665..ac2cd90ec7ceb 100644 --- a/examples/lora_with_quantization_inference.py +++ b/examples/lora_with_quantization_inference.py @@ -22,26 +22,26 @@ def create_test_prompts( # this is an example of using quantization without LoRA ("My name is", SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128), None), # the next three examples use quantization with LoRA ("my name is", SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128), LoRARequest("lora-test-1", 1, lora_path)), ("The capital of USA is", SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128), LoRARequest("lora-test-2", 1, lora_path)), ("The capital of France is", SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128), LoRARequest("lora-test-3", 1, lora_path)), ] diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 043220d979c3c..904bb6764b2e5 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -27,8 +27,8 @@ def create_test_prompts( return [ ("A robot may not injure a human being", SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128), None), ("To be or not to be,", SamplingParams(temperature=0.8, @@ -38,16 +38,16 @@ def create_test_prompts( ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora", 1, lora_path)), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, + request_sample_logprobs=1, + request_prompt_logprobs=1, max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora2", 2, lora_path)), diff --git a/tests/conftest.py b/tests/conftest.py index d6be8f5b00af8..61015117a9654 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -794,7 +794,7 @@ def generate_w_logprobs( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.prompt_logprobs is None else + if sampling_params.request_prompt_logprobs is None else toks_str_logsprobs_prompt_logprobs) def generate_encoder_decoder_w_logprobs( @@ -807,14 +807,14 @@ def generate_encoder_decoder_w_logprobs( Logprobs generation for vLLM encoder/decoder models ''' - assert sampling_params.logprobs is not None + assert sampling_params.request_sample_logprobs is not None req_outputs = self.model.generate(encoder_decoder_prompts, sampling_params=sampling_params) toks_str_logsprobs_prompt_logprobs = ( 
self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.prompt_logprobs is None else + if sampling_params.request_prompt_logprobs is None else toks_str_logsprobs_prompt_logprobs) def generate_greedy( @@ -850,8 +850,8 @@ def generate_greedy_logprobs( greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, - logprobs=num_logprobs, - prompt_logprobs=num_prompt_logprobs, + request_sample_logprobs=num_logprobs, + request_prompt_logprobs=num_prompt_logprobs, stop_token_ids=stop_token_ids, stop=stop) @@ -872,8 +872,8 @@ def generate_encoder_decoder_greedy_logprobs( greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, - logprobs=num_logprobs, - prompt_logprobs=(num_prompt_logprobs), + request_sample_logprobs=num_logprobs, + request_prompt_logprobs=(num_prompt_logprobs), ) ''' Greedy logprobs generation for vLLM encoder/decoder models diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index b8818af5614cf..09c9ed1474880 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -10,7 +10,8 @@ def test_skip_tokenizer_initialization(model: str): # of tokenizer and detokenizer. The generated output is expected to contain # token ids. llm = LLM(model=model, skip_tokenizer_init=True) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) + sampling_params = SamplingParams(request_prompt_logprobs=True, + detokenize=True) with pytest.raises(ValueError, match="cannot pass text prompts when"): llm.generate("abc", sampling_params) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 99b5d5694f9f7..68b95fb800bcb 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -24,7 +24,9 @@ # "mistralai/Mistral-Nemo-Instruct-2407" ] -SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) +SAMPLING_PARAMS = SamplingParams(max_tokens=512, + temperature=0.0, + request_sample_logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese "寫一首關於勇敢的水手的詩", # chinese diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab99054c..492cafa8a18a7 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -116,7 +116,9 @@ def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: _create_engine_inputs(IMG_URLS), ] -SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) +SAMPLING_PARAMS = SamplingParams(max_tokens=512, + temperature=0.0, + request_sample_logprobs=5) LIMIT_MM_PER_PROMPT = dict(image=4) MAX_MODEL_LEN = [8192, 65536] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 2979470120710..646ef56f23a7b 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -29,7 +29,7 @@ def pick_vllm(token_ids, logits): params_with_logprobs = SamplingParams( logits_processors=[pick_vllm], - prompt_logprobs=3, + request_prompt_logprobs=3, max_tokens=max_tokens, ) @@ -43,7 +43,7 @@ def pick_vllm(token_ids, logits): vllm_model.model._add_request( example_prompts[1], params=SamplingParams( - 
prompt_logprobs=3, + request_prompt_logprobs=3, max_tokens=max_tokens, ), ) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index c07c71e38233f..dcd75c7539fe2 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -49,11 +49,12 @@ def test_get_prompt_logprobs( max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, ) as vllm_model: - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) + vllm_sampling_params = SamplingParams( + max_tokens=max_tokens, + request_sample_logprobs=num_top_logprobs, + request_prompt_logprobs=num_top_logprobs, + temperature=0.0, + detokenize=detokenize) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) @@ -131,11 +132,11 @@ def test_get_prompt_logprobs( def test_max_logprobs(): runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) + vllm_sampling_params = SamplingParams(request_sample_logprobs=1) # should pass runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) + bad_sampling_params = SamplingParams(request_sample_logprobs=2) with pytest.raises(ValueError): runner.generate(["Hello world"], sampling_params=bad_sampling_params) @@ -160,10 +161,11 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - temperature=0.0, - detokenize=detokenize) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + request_sample_logprobs=None, + temperature=0.0, + detokenize=detokenize) results_logprobs_none = vllm_model.model.generate( example_prompts, sampling_params=sampling_params_logprobs_none) diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index ed2fee1ae252e..ba41fc615d14a 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -25,17 +25,18 @@ def test_ranks( temperature=0.0, top_p=1.0, max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) + request_sample_logprobs=num_top_logprobs, + request_prompt_logprobs=num_prompt_logprobs) vllm_results = vllm_model.generate_w_logprobs(example_prompts, vllm_sampling_params) ## Test non-greedy logprobs ranks - sampling_params = SamplingParams(temperature=1.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) + sampling_params = SamplingParams( + temperature=1.0, + top_p=1.0, + max_tokens=max_tokens, + request_sample_logprobs=num_top_logprobs, + request_prompt_logprobs=num_prompt_logprobs) res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) for result in vllm_results: diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 28c34064f670c..4c1dfb48fbe6f 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -176,7 +176,7 @@ def create_sampling_params(min_tokens, max_tokens=9999, # keep higher than max of min_tokens stop_token_ids=stop_token_ids, # requesting prompt_logprobs changes the structure of `logits` - prompt_logprobs=prompt_logprobs, + request_prompt_logprobs=prompt_logprobs, ) sampling_params.all_stop_token_ids.add(eos_token_id) 
return sampling_params @@ -395,7 +395,7 @@ def run_test_case(*, expected_penalization: List[bool], seq_lens.append(prompt_len) assert sgm.sampling_params is not None - if sgm.sampling_params.prompt_logprobs: + if sgm.sampling_params.request_prompt_logprobs: # with prompt_logprobs each token in the prompt has a row in # logits num_rows = prompt_len diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9cb3858c0068..39a9dab2b9f11 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -196,8 +196,8 @@ def run_equality_correctness_test( max_tokens=max_output_len, seed=seed, ignore_eos=ignore_eos, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs) + request_sample_logprobs=logprobs, + request_prompt_logprobs=prompt_logprobs) with vllm_runner(**org_args) as vllm_model: org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 4cfca8b78e79b..7d0d90615bac2 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -211,7 +211,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, max_tokens=output_len, ignore_eos=True, temperature=temperature, - logprobs=logprobs, + request_sample_logprobs=logprobs, ) sd_args = { diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 84348cbc0bced..2fce280b188bb 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -201,7 +201,7 @@ def test_decode_sequence_logprobs(complete_sequence: str, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, - logprobs=2) + request_sample_logprobs=2) # Run sequentially. seq = create_sequence() @@ -234,7 +234,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], detokenizer: Detokenizer): """Verify Detokenizer decodes prompt logprobs correctly.""" sampling_params = SamplingParams(skip_special_tokens=True, - prompt_logprobs=1) + request_prompt_logprobs=1) # Run sequentially. 
seq = create_sequence(complete_sequence_token_ids) @@ -294,8 +294,8 @@ def test_decode_prompt_logprobs_chunked_prefill( max_num_seqs=max_num_seqs) as vllm_model: vllm_sampling_params = SamplingParams(max_tokens=10, - logprobs=5, - prompt_logprobs=5, + request_sample_logprobs=5, + request_prompt_logprobs=5, temperature=0.0) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 986844c397926..f3617067455da 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -228,7 +228,6 @@ def validate_requests_logprobs( requests: list of detokenizer input requests request_outputs: list of detokenizer outputs """ - assert len(requests) == len(request_outputs) for req, req_out in zip(requests, request_outputs): if req.logprobs is not None and req.logprobs > 0: # Validate sample logprobs diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/samplers/test_logprobs.py index 48f3414b4e693..1a1d361170187 100644 --- a/tests/v1/samplers/test_logprobs.py +++ b/tests/v1/samplers/test_logprobs.py @@ -64,8 +64,8 @@ def _test_case_get_logprobs_and_prompt_logprobs( # Generate SamplingParams vllm_sampling_params = [ SamplingParams(max_tokens=max_tokens, - logprobs=lp, - prompt_logprobs=plp, + request_sample_logprobs=lp, + request_prompt_logprobs=plp, temperature=0.0, detokenize=detokenize) for lp, plp in logprob_prompt_logprob_list @@ -291,11 +291,11 @@ def test_max_logprobs(monkeypatch): override_backend_env_variable(monkeypatch, "FLASH_ATTN") runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) + vllm_sampling_params = SamplingParams(request_sample_logprobs=1) # should pass runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) + bad_sampling_params = SamplingParams(request_sample_logprobs=2) with pytest.raises(ValueError): runner.generate(["Hello world"], sampling_params=bad_sampling_params) @@ -322,10 +322,11 @@ def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + request_sample_logprobs=None, + request_prompt_logprobs=None, + temperature=0.0) results_logprobs_none = vllm_model.model.generate( example_prompts, sampling_params=sampling_params_logprobs_none) diff --git a/tests/v1/samplers/utils.py b/tests/v1/samplers/utils.py index 4c9eae5e3f0a9..5ee260c97c453 100644 --- a/tests/v1/samplers/utils.py +++ b/tests/v1/samplers/utils.py @@ -1,8 +1,10 @@ """Sampler testing utils""" import re from typing import List, Tuple + from vllm import CompletionOutput + def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: """Generate logprobs configs for a batch of requests @@ -104,7 +106,15 @@ def assert_incr_detok_str_matches_non_incr_detok_str( def compute_correct_cumulative_logprob( completion_output: CompletionOutput) -> float: + """Compute known-good value for evaluating cumulative logprob + + Args: + completion_output: completion output from engine + + Returns: + Known-good cumulative logprob value + """ token_ids = completion_output.token_ids logprobs = completion_output.logprobs assert logprobs is not None - return sum([lp[tok_id].logprob 
for tok_id, lp in zip(token_ids, logprobs)]) \ No newline at end of file + return sum([lp[tok_id].logprob for tok_id, lp in zip(token_ids, logprobs)]) diff --git a/tests/v1/utils.py b/tests/v1/utils.py index 52b9d3ada2c20..db9193a487c95 100644 --- a/tests/v1/utils.py +++ b/tests/v1/utils.py @@ -1,6 +1,7 @@ """V1 vLLM engine test utils""" import os + def assert_vllm_use_v1(): if os.getenv("VLLM_USE_V1") != "1": - raise OSError("Test requires VLLM_USE_V1=\"1\"") \ No newline at end of file + raise OSError("Test requires VLLM_USE_V1=\"1\"") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 26a8c94099a11..12fbd4cdfcf39 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -895,10 +895,10 @@ def _create_sequence_group_with_sampling( ) -> SequenceGroup: """Creates a SequenceGroup with SamplingParams.""" max_logprobs = self.get_model_config().max_logprobs - if (sampling_params.logprobs - and sampling_params.logprobs > max_logprobs) or ( - sampling_params.prompt_logprobs - and sampling_params.prompt_logprobs > max_logprobs): + if (sampling_params.request_sample_logprobs + and sampling_params.request_sample_logprobs > max_logprobs + ) or (sampling_params.request_prompt_logprobs + and sampling_params.request_prompt_logprobs > max_logprobs): raise ValueError(f"Cannot request more than " f"{max_logprobs} logprobs.") diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 4079de7d36793..dac592f9f373d 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -95,7 +95,7 @@ async def beam_search( tokenizer.eos_token_id, length_penalty) beam_search_params = SamplingParams( - logprobs=2 * beam_width, + request_sample_logprobs=2 * beam_width, max_tokens=1, temperature=temperature, ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 65fa9873df28c..b64d01f48b4a9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -465,7 +465,8 @@ def sort_beams_key(x: BeamSearchSequence) -> float: # generate 2 * beam_width candidates at each step # following the huggingface transformers implementation # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa - beam_search_params = SamplingParams(logprobs=2 * beam_width, + beam_search_params = SamplingParams(request_sample_logprobs=2 * + beam_width, max_tokens=1, temperature=temperature) instances: List[BeamSearchInstance] = [] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c10efefea5471..89156850900f7 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -967,9 +967,9 @@ def get_logprobs( # Update indices and tokens for prompt logprobs. 
if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): + and sampling_params.request_prompt_logprobs is not None): largest_num_logprobs = max(largest_num_logprobs, - sampling_params.prompt_logprobs) + sampling_params.request_prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) next_token_ids.extend(next_prompt_tokens) @@ -986,9 +986,10 @@ def get_logprobs( [query_idx + parent_id for parent_id in parent_seq_ids]) next_token_ids.extend(token_ids) - if sampling_params.logprobs is not None: - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.logprobs) + if sampling_params.request_sample_logprobs is not None: + largest_num_logprobs = max( + largest_num_logprobs, + sampling_params.request_sample_logprobs) assert len(next_token_ids) == len(query_indices) @@ -1070,9 +1071,9 @@ def _get_prompt_logprob_if_needed( # Find prompt logprobs prompt_logprobs: Optional[PromptLogprobs] = None - if is_prompt and sampling_params.prompt_logprobs is not None: + if is_prompt and sampling_params.request_prompt_logprobs is not None: prompt_logprobs = [] - num_logprobs = sampling_params.prompt_logprobs + num_logprobs = sampling_params.request_prompt_logprobs next_prompt_tokens = _get_next_prompt_tokens(seq_group) # Pre-select indexes and create a list. It is faster than calling .item # repetitively. @@ -1127,7 +1128,7 @@ def _get_sampled_logprob_if_needed( ): """Compute the sample logprob if needed.""" seq_ids = seq_group.seq_ids - num_logprobs = seq_group.sampling_params.logprobs + num_logprobs = seq_group.sampling_params.request_sample_logprobs sampled_logprobs: SampleLogprobs = [] next_token_ids, parent_seq_ids = sample_result diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 1df8f84ed4093..579319ffdf2ed 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -52,7 +52,7 @@ def do_sample(self): def __post_init__(self): if len(self.prompt_logprob_indices) > 0: - assert self.sampling_params.prompt_logprobs is not None + assert self.sampling_params.request_prompt_logprobs is not None if self.is_prompt: assert self.seq_len is not None assert self.query_len is not None @@ -300,7 +300,7 @@ def _prepare_seq_groups( logits = hidden_states[selected_token_indices] """ - if sampling_params.prompt_logprobs is not None: + if sampling_params.request_prompt_logprobs is not None: selected_token_indices.extend( range(model_output_idx, model_output_idx + prompt_logprob_len)) model_output_idx += prompt_logprob_len @@ -322,7 +322,7 @@ def sample(logits): # sample_indices to find sample indices. 
""" - if sampling_params.prompt_logprobs is not None: + if sampling_params.request_prompt_logprobs is not None: prompt_logprob_indices.extend( range(logit_idx, logit_idx + prompt_logprob_len)) logit_idx += prompt_logprob_len @@ -426,7 +426,8 @@ def from_sampling_metadata( do_penalties = True is_prompt = seq_group.is_prompt - if is_prompt and sampling_params.prompt_logprobs is not None: + if (is_prompt + and sampling_params.request_prompt_logprobs is not None): # For tokens in the prompt that we only need to get # their logprobs query_len = seq_group.query_len @@ -455,8 +456,8 @@ def from_sampling_metadata( for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params - if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): + if (seq_group.is_prompt and + sampling_params.request_prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) prompt_tokens.extend( array(VLLM_TOKEN_ID_ARRAY_TYPE) diff --git a/vllm/outputs.py b/vllm/outputs.py index c412d5ce21571..c6d0a31cbd8d8 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -205,7 +205,7 @@ def from_seq_group( # NOTE: We need omit logprobs here explicitly because the sequence # always has the logprobs of the sampled tokens even if the # logprobs are not requested. - include_logprobs = sampling_params.logprobs is not None + include_logprobs = sampling_params.request_sample_logprobs is not None text_buffer_length = sampling_params.output_text_buffer_length delta = sampling_params.output_kind == RequestOutputKind.DELTA diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index fc77f3ca529b2..cc4d16b3dc6ce 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -184,8 +184,10 @@ class SamplingParams( ignore_eos: bool = False max_tokens: Optional[int] = 16 min_tokens: int = 0 - logprobs: Optional[int] = None - prompt_logprobs: Optional[int] = None + # Number of sample logprobs and prompt logprobs, + # respectively, requested + request_sample_logprobs: Optional[int] = None + request_prompt_logprobs: Optional[int] = None # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. @@ -268,8 +270,8 @@ def from_optional( ignore_eos=ignore_eos, max_tokens=max_tokens, min_tokens=min_tokens, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, + request_sample_logprobs=logprobs, + request_prompt_logprobs=prompt_logprobs, detokenize=detokenize, skip_special_tokens=skip_special_tokens, spaces_between_special_tokens=spaces_between_special_tokens, @@ -326,9 +328,12 @@ def __post_init__(self) -> None: else: self.bad_words = list(self.bad_words) - self.logprobs = 1 if self.logprobs is True else self.logprobs - self.prompt_logprobs = (1 if self.prompt_logprobs is True else - self.prompt_logprobs) + self.request_sample_logprobs = (1 + if self.request_sample_logprobs is True + else self.request_sample_logprobs) + self.request_prompt_logprobs = (1 + if self.request_prompt_logprobs is True + else self.request_prompt_logprobs) # Number of characters to hold back for stop string evaluation # until sequence is finished. 
@@ -385,12 +390,14 @@ def _verify_args(self) -> None: raise ValueError( f"min_tokens must be less than or equal to " f"max_tokens={self.max_tokens}, got {self.min_tokens}.") - if self.logprobs is not None and self.logprobs < 0: - raise ValueError( - f"logprobs must be non-negative, got {self.logprobs}.") - if self.prompt_logprobs is not None and self.prompt_logprobs < 0: + if (self.request_sample_logprobs is not None + and self.request_sample_logprobs < 0): + raise ValueError(f"logprobs must be non-negative, " + f"got {self.request_sample_logprobs}.") + if (self.request_prompt_logprobs is not None + and self.request_prompt_logprobs < 0): raise ValueError(f"prompt_logprobs must be non-negative, got " - f"{self.prompt_logprobs}.") + f"{self.request_prompt_logprobs}.") if (self.truncate_prompt_tokens is not None and self.truncate_prompt_tokens < 1): raise ValueError(f"truncate_prompt_tokens must be >= 1, " @@ -481,8 +488,8 @@ def __repr__(self) -> str: f"ignore_eos={self.ignore_eos}, " f"max_tokens={self.max_tokens}, " f"min_tokens={self.min_tokens}, " - f"logprobs={self.logprobs}, " - f"prompt_logprobs={self.prompt_logprobs}, " + f"logprobs={self.request_sample_logprobs}, " + f"prompt_logprobs={self.request_prompt_logprobs}, " f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index ced7f53827665..62cb4a87e7f90 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -539,8 +539,9 @@ def _serialize_sampler_output_no_logprobs( populated. """ seq_output_prompt_logprobs = [ - seq.is_prompt and seq.sampling_params.prompt_logprobs is not None - and seq.sampling_params.prompt_logprobs > 0 + seq.is_prompt + and seq.sampling_params.request_prompt_logprobs is not None + and seq.sampling_params.request_prompt_logprobs > 0 for seq in execute_model_req.seq_group_metadata_list ] # ignore slots for prompt tokens that are filled with INVALID_TOKEN_ID diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da8706658d09a..1ecc653521ad9 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -23,7 +23,8 @@ def get_all_num_logprobs( all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: - num_logprobs = seq_group_metadata.sampling_params.logprobs + num_logprobs = ( + seq_group_metadata.sampling_params.request_sample_logprobs) if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c0186d419c1a2..5fc4f2e425726 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -119,12 +119,13 @@ def update_from_output( # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_logprobs = model_runner_output.logprobs_cpu is not None - do_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu is not None - and len(model_runner_output.prompt_logprobs_cpu) > 0) + do_batch_sample_logprobs = (model_runner_output.batch_logprobs_cpu + is not None) + do_batch_prompt_logprobs = ( + model_runner_output.batch_prompt_logprobs_cpu is not None + and len(model_runner_output.batch_prompt_logprobs_cpu) > 0) - if do_prompt_logprobs: + if do_batch_prompt_logprobs: # Index into prompt tokens, for building # prompt logprobs output data structure mr_output_slice_lower_index = 0 @@ -136,12 +137,13 @@ def update_from_output( request.num_computed_tokens += num_scheduled_tokens[req_id] req_index = model_runner_output.req_id_to_index[req_id] num_new_tokens = 1 - max_logprobs = request.max_logprobs - request_do_logprobs = (do_logprobs and max_logprobs is not None - and max_logprobs > 0) + request_sample_logprobs = request.request_sample_logprobs + request_do_logprobs = (do_batch_sample_logprobs + and request_sample_logprobs is not None + and request_sample_logprobs > 0) - if do_prompt_logprobs: - max_prompt_logprobs = request.max_prompt_logprobs + if do_batch_prompt_logprobs: + request_prompt_logprobs = request.request_prompt_logprobs # Number of new prompt tokens is the number of scheduled # tokens *if* the request is partial (because the sampled # token is discarded and all sequence offsets are prompt @@ -153,8 +155,9 @@ def update_from_output( num_scheduled_tokens[request.request_id] - int(req_is_not_partial)) - request_do_prompt_logprobs = (max_prompt_logprobs is not None - and max_prompt_logprobs > 0 + request_do_prompt_logprobs = (request_prompt_logprobs + is not None + and request_prompt_logprobs > 0 and num_new_prompt_tokens > 0) if request_do_prompt_logprobs: @@ -170,15 +173,15 @@ def update_from_output( # # Note: new_prompt_logprobs will be used later to build the # engine core output - logprob_cnt = max_prompt_logprobs + logprob_cnt = request_prompt_logprobs mr_output_slice_upper_index = ( mr_output_slice_lower_index + num_new_prompt_tokens) new_prompt_logprobs = ( - model_runner_output.prompt_logprobs_cpu[ + model_runner_output.batch_prompt_logprobs_cpu[ mr_output_slice_lower_index: mr_output_slice_upper_index, 0:logprob_cnt]) new_prompt_logprob_token_ids = ( - model_runner_output.prompt_logprob_token_ids_cpu[ + model_runner_output.batch_prompt_logprob_token_ids_cpu[ mr_output_slice_lower_index: mr_output_slice_upper_index, 0:logprob_cnt]) @@ -219,8 +222,9 @@ def update_from_output( # Slice out this request's sample logprobs; defer # pythonization to be carried out in the frontend. request.logprobs.append( - (model_runner_output.logprobs_cpu[req_index], - model_runner_output.logprob_token_ids_cpu[req_index])) + (model_runner_output.batch_logprobs_cpu[req_index], + model_runner_output. + batch_logprob_token_ids_cpu[req_index])) request.append_output_token_ids(token_id) # TODO: Update the KV cache manager for prefix caching. 
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 89ffa0dac21d4..e1a0156d3183a 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -181,8 +181,8 @@ def _pythonize_sequence_position( def _pythonize_maybe_detokenize_sample_logprobs_for_request( self, - new_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], - new_token_ids: List[int], + new_sample_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], + new_sample_token_ids: List[int], detokenize: bool, ) -> SampleLogprobs: """Pythonize sample logprobs, maybe detokenize. @@ -202,8 +202,9 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( from the associated top token id) if detokenize=True Args: - new_logprobs: List of (logprobs,logprob token ids) numpy array tuples - new_token_ids: List of sample token ids + new_sample_logprobs: List of (logprobs,logprob token ids) numpy array + tuples + new_sample_token_ids: List of sample token ids detokenize: Logprob.decoded_token is computed if True, otherwise None Returns: @@ -213,7 +214,8 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( new_pythonized_logprobs = [] max_logprobs = self.max_request_sample_logprobs for (logprob_values, - logprob_token_ids), token_id in zip(new_logprobs, new_token_ids): + logprob_token_ids), token_id in zip(new_sample_logprobs, + new_sample_token_ids): # Only keep the number of logprobs specified by the request # (plus possibly the sampled token id & its logprob) logprob_cnt = max_logprobs @@ -292,8 +294,8 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( def add_tokens( self, - new_token_ids: List[int], - new_logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], + new_sampled_token_ids: List[int], + new_sample_logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], new_prompt_logprobs: Optional[npt.NDArray], new_prompt_logprob_token_ids: Optional[npt.NDArray], finish_reason: Optional[str], @@ -326,10 +328,10 @@ def add_tokens( """ # Only try to Pythonize sample logprobs if any were provided - do_request_sample_logprobs = new_logprobs is not None and len( - new_logprobs) > 0 - assert not do_request_sample_logprobs or len(new_logprobs) == len( - new_token_ids) + do_request_sample_logprobs = new_sample_logprobs is not None and len( + new_sample_logprobs) > 0 + assert not do_request_sample_logprobs or len( + new_sample_logprobs) == len(new_sampled_token_ids) # Only try to Pythonize prompt logprobs if any were provided do_request_prompt_logprobs = new_prompt_logprobs is not None and len( new_prompt_logprobs) > 0 @@ -338,9 +340,11 @@ def add_tokens( if do_request_sample_logprobs: # 1) Pythonize & detokenize sample logprobs - new_logprobs = ( + new_sample_logprobs = ( self._pythonize_maybe_detokenize_sample_logprobs_for_request( - new_logprobs, new_token_ids, detokenize=True)) + new_sample_logprobs, + new_sampled_token_ids, + detokenize=True)) if do_request_prompt_logprobs: # 2) If necessary, detokenize prompt logprobs incrementally @@ -355,7 +359,7 @@ def add_tokens( # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. 
decoded_text = "" - for new_token_id in new_token_ids: + for new_token_id in new_sampled_token_ids: self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -404,8 +408,8 @@ def add_tokens( # logprob # FINAL -> all sampled tokens and logprobs + current cumulative prompt # logprob - token_ids = new_token_ids if delta else self.output_token_ids - logprobs = new_logprobs if delta else self.request_logprobs + token_ids = new_sampled_token_ids if delta else self.output_token_ids + logprobs = new_sample_logprobs if delta else self.request_logprobs prompt_logprobs = (new_prompt_logprobs if delta else self.request_prompt_logprobs) cumulative_logprob = self.request_cumulative_logprob @@ -510,8 +514,8 @@ def step( # Detokenize and update state. request_output = detokenizer.add_tokens( - new_token_ids=engine_core_output.new_token_ids, - new_logprobs=engine_core_output.logprobs, + new_sampled_token_ids=engine_core_output.new_token_ids, + new_sample_logprobs=engine_core_output.logprobs, new_prompt_logprobs=engine_core_output.prompt_logprobs, new_prompt_logprob_token_ids=engine_core_output. prompt_logprobs_token_ids, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 3f6fc33d5cae0..535874a1fd6de 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -59,9 +59,10 @@ def _assert_valid_sample_logprobs_prompt_logprobs( """ if isinstance(params, SamplingParams) and ( - (params.logprobs and params.logprobs > max_logprobs) or - (params.prompt_logprobs - and params.prompt_logprobs > max_logprobs)): + (params.request_sample_logprobs + and params.request_sample_logprobs > max_logprobs) or + (params.request_prompt_logprobs + and params.request_prompt_logprobs > max_logprobs)): raise ValueError(f"Cannot request more than " f"{max_logprobs} logprobs or prompt logprobs.") @@ -166,8 +167,8 @@ def process_inputs( sampling_params.output_kind, sampling_params.stop, sampling_params.include_stop_str_in_output, - sampling_params.logprobs, - sampling_params.prompt_logprobs, + sampling_params.request_sample_logprobs, + sampling_params.request_prompt_logprobs, ) # Make Request for EngineCore. 
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 12a71f419c05c..8de33f413fed9 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -12,14 +12,14 @@ class SamplerOutput: sampled_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: Optional[torch.Tensor] = None + batch_sample_logprob_token_ids: Optional[torch.Tensor] = None # [num_reqs, max_num_logprobs + 1] - logprobs: Optional[torch.Tensor] = None + batch_sample_logprobs: Optional[torch.Tensor] = None # [num_prompt_tokens, max_num_prompt_logprobs + 1] - prompt_logprobs: Optional[torch.Tensor] = None + batch_prompt_logprobs: Optional[torch.Tensor] = None # [num_prompt_tokens, max_num_prompt_logprobs + 1] - prompt_logprob_token_ids: Optional[torch.Tensor] = None + batch_prompt_logprob_token_ids: Optional[torch.Tensor] = None @dataclass @@ -34,11 +34,11 @@ class ModelRunnerOutput: sampled_token_ids_cpu: torch.Tensor # [num_reqs, max_num_logprobs + 1] - logprob_token_ids_cpu: Optional[npt.NDArray] + batch_logprob_token_ids_cpu: Optional[npt.NDArray] # [num_reqs, max_num_logprobs + 1] - logprobs_cpu: Optional[npt.NDArray] + batch_logprobs_cpu: Optional[npt.NDArray] # [num_reqs, max_num_prompt_logprobs] - prompt_logprob_token_ids_cpu: Optional[npt.NDArray] + batch_prompt_logprob_token_ids_cpu: Optional[npt.NDArray] # [num_reqs, max_num_prompt_logprobs] - prompt_logprobs_cpu: Optional[npt.NDArray] + batch_prompt_logprobs_cpu: Optional[npt.NDArray] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 9f14e7c9e16e9..7fd37f2effe0c 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -46,8 +46,10 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() - self.max_logprobs = sampling_params.logprobs - self.max_prompt_logprobs = sampling_params.prompt_logprobs + # Number of sample logprobs and prompt logprobs requested, + # respectively + self.request_sample_logprobs = sampling_params.request_sample_logprobs + self.request_prompt_logprobs = sampling_params.request_prompt_logprobs # If sample logprobs are enabled, the number of sample logprobs cannot # be anticipated in advance (because the LLM is partially responsible # for deciding when the completion is finished.) So, @@ -64,15 +66,15 @@ def __init__( # this was not employed because the array could be very large for large # context windows, even if the completion was very short. 
self.logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]] = ( - None if self.max_logprobs is None else []) + None if self.request_sample_logprobs is None else []) # The number of prompt logprobs is known is advance, so preallocate an # NDArray self.prompt_logprobs: Optional[npt.NDArray] = ( - None if self.max_prompt_logprobs is None else np.empty( - (self.num_prompt_tokens, self.max_prompt_logprobs))) + None if self.request_prompt_logprobs is None else np.empty( + (self.num_prompt_tokens, self.request_prompt_logprobs))) self.prompt_logprob_token_ids: Optional[npt.NDArray] = ( - None if self.max_prompt_logprobs is None else np.empty( - (self.num_prompt_tokens, self.max_prompt_logprobs), + None if self.request_prompt_logprobs is None else np.empty( + (self.num_prompt_tokens, self.request_prompt_logprobs), dtype=np.int32)) self.num_computed_tokens = 0 diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index b9c97bcfb0d47..38297ccac355a 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -20,8 +20,8 @@ class SamplingMetadata: # Max number of sample or prompt logprobs # (respectiely) at the batch level - max_num_logprobs: int - max_num_prompt_logprobs: int + max_num_batch_sample_logprobs: int + max_num_batch_prompt_logprobs: int # Attributes which support logprob computation query_start_loc: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index e0b03f7aa03b3..dea4607ff8d19 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -38,7 +38,7 @@ def _probs_sample( def _top_logprobs_token_indices( self, - logprobs: torch.Tensor, + logprob_values: torch.Tensor, max_num_logprobs: int, ) -> Tuple[torch.Tensor, torch.Tensor]: """Compute top logprobs and associated token indices @@ -47,13 +47,14 @@ def _top_logprobs_token_indices( logprobs: total_tokens x vocab tensor max_num_logprobs: Max number of top {sample,prompt} logprobs requested in batch (depending on whether top sample - logprobs or top prompt logprobs are being computed) + logprobs or top prompt logprobs are being computed). + This will be the k. Returns: Top logprobs, total_tokens x max_num_logprobs tensor Top logprob token indices, total_tokens x max_num_logprobs tensor """ - topk_logprobs, topk_indices = torch.topk(logprobs, + topk_logprobs, topk_indices = torch.topk(logprob_values, max_num_logprobs, dim=-1) # Use int32 to reduce the tensor size. @@ -61,8 +62,8 @@ def _top_logprobs_token_indices( def _compute_logprobs_from_processed_logits( self, - do_logprobs: bool, - do_prompt_logprobs: bool, + do_batch_sample_logprobs: bool, + do_batch_prompt_logprobs: bool, maybe_sampled: torch.Tensor, maybe_sample_logits_indices: Optional[torch.Tensor], prompt_logits_mask: Optional[torch.Tensor], @@ -75,16 +76,18 @@ def _compute_logprobs_from_processed_logits( Consumes logits which have already had temperature, top-k and top-p applied. - `do_logprobs` and `do_prompt_logprobs` control whether sample and - prompt logprobs are computed, respectively. + `do_batch_sample_logprobs` and `do_batch_prompt_logprobs` control + whether sample and prompt logprobs are computed, respectively. This function does not handle the case where no logprobs are required at the batch level; it is assumed this function will not be called in that scenario. 
Args: - do_logprobs: compute sample logprobs - do_prompt_logprobs: compute prompt logprobs + do_batch_sample_logprobs: at least one request in the batch requires + sample logprobs to be computed + do_batch_prompt_logprobs: at least one request in the batch requires + prompt logprobs to be computed maybe_sampled: list of sampled tokens; if there is a partial request, includes the partial request's sampled token (which will later be discarded.) @@ -109,20 +112,21 @@ def _compute_logprobs_from_processed_logits( top-p applied. Returns: - Sample logprobs (`None` if `do_logprobs == False`, + Sample logprobs (`None` if `do_batch_sample_logprobs == False`, o/w num_samples x max_num_logprobs tensor) - Sample logprobs token indices (`None` if `do_logprobs == False`, + Sample logprobs token indices (`None` if + `do_batch_sample_logprobs == False`, o/w num_samples x max_num_logprobs tensor) - Prompt logprobs (`None` if `do_prompt_logprobs == False`, + Prompt logprobs (`None` if `do_batch_prompt_logprobs == False`, o/w num_prompt_tokens x max_num_prompt_logprobs tensor) Prompt logprobs token indices (`None` if - `do_prompt_logprobs == False`, o/w + `do_batch_prompt_logprobs == False`, o/w num_prompt_tokens x max_num_prompt_logprobs tensor) """ - assert do_logprobs or do_prompt_logprobs - if do_logprobs and do_prompt_logprobs: + assert do_batch_sample_logprobs or do_batch_prompt_logprobs + if do_batch_sample_logprobs and do_batch_prompt_logprobs: # Batch requires sample and prompt logprobs # - Compute logprobs for all sequence offsets @@ -135,7 +139,7 @@ def _compute_logprobs_from_processed_logits( maybe_sample_topk_indices, ) = self._top_logprobs_token_indices( logprobs[maybe_sample_logits_indices, :], - sampling_metadata.max_num_logprobs) + sampling_metadata.max_num_batch_sample_logprobs) # - In case sampled tokens are not in the top logprobs at their # respective sequence offsets, gather logprobs associated with @@ -155,8 +159,8 @@ def _compute_logprobs_from_processed_logits( # Prompt logprobs and token indices self._top_logprobs_token_indices( logprobs[prompt_logits_mask, :], - sampling_metadata.max_num_prompt_logprobs)) - elif do_logprobs: + sampling_metadata.max_num_batch_prompt_logprobs)) + elif do_batch_sample_logprobs: # Batch requires only sample logprobs # - Compute top logprobs only at sequence offsets where new tokens @@ -166,7 +170,7 @@ def _compute_logprobs_from_processed_logits( maybe_sample_topk_logprobs, maybe_sample_topk_indices, ) = self._top_logprobs_token_indices( - logprobs, sampling_metadata.max_num_logprobs) + logprobs, sampling_metadata.max_num_batch_sample_logprobs) # - In case sampled tokens are not in the top logprobs at their # respective sequence offsets, gather logprobs associated with @@ -188,7 +192,7 @@ def _compute_logprobs_from_processed_logits( return (maybe_sample_topk_logprobs, maybe_sample_topk_indices, None, None) - elif do_prompt_logprobs: + elif do_batch_prompt_logprobs: # Batch requires only prompt logprobs # - Compute top logprobs only at sequence offsets of prompt tokens @@ -197,7 +201,7 @@ def _compute_logprobs_from_processed_logits( # Return prompt logprobs return ((None, None) + self._top_logprobs_token_indices( - logprobs, sampling_metadata.max_num_prompt_logprobs)) + logprobs, sampling_metadata.max_num_batch_prompt_logprobs)) def forward( self, @@ -220,12 +224,18 @@ def forward( (if requested) """ - # Batch-level logprobs configs. `do_logprobs` indicates whether - # any request requires sample logprobs. 
`do_prompt_logprobs` - # indicates whether any request requires prompt logprobs. - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 - do_any_logprobs = do_logprobs or do_prompt_logprobs + # Batch-level logprobs configs. `do_batch_sample_logprobs` + # indicates whether any request requires sample logprobs. + # `do_batch_prompt_logprobs` indicates whether any request + # requires prompt logprobs. `do_batch_any_logprobs` indicates + # whether, overall, any request in the batch requires logprobs + # computed + do_batch_sample_logprobs = ( + sampling_metadata.max_num_batch_sample_logprobs > 0) + do_batch_prompt_logprobs = ( + sampling_metadata.max_num_batch_prompt_logprobs > 0) + do_batch_any_logprobs = (do_batch_sample_logprobs + or do_batch_prompt_logprobs) num_query_tokens = sampling_metadata.num_query_tokens # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial @@ -244,7 +254,7 @@ def forward( # Apply temperature, top-k and top-p to logits at sequence offsets # where a new token is being decoded. - if do_prompt_logprobs: + if do_batch_prompt_logprobs: # If prompt logprobs are required, then temp/top-k/top-p # must also be applied to prompt logits as a prerequisite. # So pass *all* logits through temp/top-k/top-p, then gather @@ -270,15 +280,15 @@ def forward( sampling_metadata) # Compute sample & prompt logprobs, as-needed - if do_any_logprobs: + if do_batch_any_logprobs: ( maybe_sample_logprobs, maybe_sample_logprobs_token_indices, prompt_logprobs, prompt_logprobs_token_indices, ) = self._compute_logprobs_from_processed_logits( - do_logprobs=do_logprobs, - do_prompt_logprobs=do_prompt_logprobs, + do_batch_sample_logprobs=do_batch_sample_logprobs, + do_batch_prompt_logprobs=do_batch_prompt_logprobs, maybe_sampled=maybe_sampled, maybe_sample_logits_indices=maybe_sample_logits_indices, prompt_logits_mask=prompt_logits_mask, @@ -286,16 +296,17 @@ def forward( maybe_sample_logits_w_tmp_tpk_tpp= maybe_sample_logits_w_tmp_tpk_tpp, logits_w_tmp_tpk_tpp=(logits_w_tmp_tpk_tpp - if do_prompt_logprobs else None)) + if do_batch_prompt_logprobs else None)) # Return decoded output tokens and sample/prompt logprobs, # as required return SamplerOutput( sampled_token_ids=maybe_sampled, - logprobs=maybe_sample_logprobs, - logprob_token_ids=maybe_sample_logprobs_token_indices, - prompt_logprobs=prompt_logprobs, - prompt_logprob_token_ids=prompt_logprobs_token_indices) + batch_sample_logprobs=maybe_sample_logprobs, + batch_sample_logprob_token_ids= + maybe_sample_logprobs_token_indices, + batch_prompt_logprobs=prompt_logprobs, + batch_prompt_logprob_token_ids=prompt_logprobs_token_indices) else: # No logprobs; return decoded output tokens return SamplerOutput(sampled_token_ids=maybe_sampled) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5766448312cbe..01edb637a9644 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -460,8 +460,12 @@ def execute_model( sampling_metadata = self._prepare_sampling( scheduler_output, num_input_tokens, attn_metadata.query_start_loc) - do_logprobs = sampling_metadata.max_num_logprobs > 0 - do_prompt_logprobs = sampling_metadata.max_num_prompt_logprobs > 0 + # Indicate whether one or more requests in the batch require sample + # logprobs or prompt logprobs to be computed, respectively + do_batch_sample_logprobs = ( + sampling_metadata.max_num_batch_sample_logprobs > 0) + do_batch_prompt_logprobs = ( + 
sampling_metadata.max_num_batch_prompt_logprobs > 0) # Get the inputs embeds. if encoder_outputs: @@ -523,16 +527,18 @@ def execute_model( sampled_token_ids_cpu=sampled_token_ids, # NOTE: sample and prompt logprob CPU-GPU synchronization happens # here - logprob_token_ids_cpu=( - sampler_output.logprob_token_ids.cpu().numpy() - if do_logprobs else None), - logprobs_cpu=(sampler_output.logprobs.cpu().numpy() - if do_logprobs else None), - prompt_logprob_token_ids_cpu=( - sampler_output.prompt_logprob_token_ids.cpu().numpy() - if do_prompt_logprobs else None), - prompt_logprobs_cpu=(sampler_output.prompt_logprobs.cpu().numpy() - if do_prompt_logprobs else None)) + batch_logprob_token_ids_cpu=( + sampler_output.batch_sample_logprob_token_ids.cpu().numpy() + if do_batch_sample_logprobs else None), + batch_logprobs_cpu=( + sampler_output.batch_sample_logprobs.cpu().numpy() + if do_batch_sample_logprobs else None), + batch_prompt_logprob_token_ids_cpu=( + sampler_output.batch_prompt_logprob_token_ids.cpu().numpy() + if do_batch_prompt_logprobs else None), + batch_prompt_logprobs_cpu=( + sampler_output.batch_prompt_logprobs.cpu().numpy() + if do_batch_prompt_logprobs else None)) return model_runner_output def load_model(self) -> None: @@ -763,13 +769,13 @@ def add_request( self.generators[req_index] = request.generator - num_logprobs = sampling_params.logprobs - num_prompt_logprobs = sampling_params.prompt_logprobs + num_logprobs = sampling_params.request_sample_logprobs + num_prompt_logprobs = sampling_params.request_prompt_logprobs if num_logprobs is not None and num_logprobs > 0: self.num_logprobs[req_id] = num_logprobs if num_prompt_logprobs is not None and num_prompt_logprobs > 0: self.num_prompt_logprobs[req_id] = num_prompt_logprobs - if sampling_params.prompt_logprobs: + if sampling_params.request_prompt_logprobs: self.prompt_logprob_reqs.add(req_id) def remove_request(self, req_id: str) -> Optional[int]: @@ -869,8 +875,8 @@ def make_sampling_metadata( no_top_p=self.no_top_p, no_top_k=self.no_top_k, generators=self.generators, - max_num_logprobs=self.max_num_logprobs, - max_num_prompt_logprobs=self.max_num_prompt_logprobs, + max_num_batch_sample_logprobs=self.max_num_logprobs, + max_num_batch_prompt_logprobs=self.max_num_prompt_logprobs, # Required for sampling indices computation query_start_loc=query_start_loc, num_input_tokens=num_input_tokens, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 99cf9a7e67256..42ed3fa39abf3 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -846,8 +846,8 @@ def _prepare_prompt( lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( [lora_id] * - (max_prompt_len - context_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + (max_prompt_len - context_len if seq_group_metadata. 
+ sampling_params.request_prompt_logprobs else 1)) input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -1154,8 +1154,8 @@ def prepare_input_tensors( paddings = list(itertools.accumulate(paddings)) paddings_prompt_logprobs = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): - if seq_group_metadata.sampling_params.prompt_logprobs is not None \ - and seq_group_metadata.is_prompt: + if (seq_group_metadata.sampling_params.request_prompt_logprobs + is not None and seq_group_metadata.is_prompt): paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) paddings = torch.tensor( paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1bc5f65c7127f..a27ada83d5da7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -625,8 +625,8 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, inter_data.lora_prompt_mapping.append( [lora_id] * (query_len if seq_group_metadata.sampling_params - and seq_group_metadata.sampling_params.prompt_logprobs is not None - else 1)) + and seq_group_metadata.sampling_params.request_prompt_logprobs + is not None else 1)) def _compute_prompt_adapter_input( self, inter_data: InterDataForSeqGroup, @@ -653,8 +653,8 @@ def _compute_prompt_adapter_input( prompt_adapter_id ] * num_tokens + [0] * (query_len - num_tokens) inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( - query_len if seq_group_metadata.sampling_params - and seq_group_metadata.sampling_params.prompt_logprobs else 1) + query_len if seq_group_metadata.sampling_params and + seq_group_metadata.sampling_params.request_prompt_logprobs else 1) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 3ca0d88a42183..0783fed12daf8 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -775,12 +775,14 @@ def _pythonize_sampler_output( seq_groups = sampling_metadata.seq_groups prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt + sg.sampling_params.request_prompt_logprobs is not None and sg.is_prompt for sg in seq_groups ]) any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) + prompt_logprobs_are_requested_for_prefill or any([ + sg.sampling_params.request_sample_logprobs is not None + for sg in seq_groups + ])) if prompt_logprobs_are_requested_for_prefill: # CPU GPU sync, after gathering *only* sampled tokens (since diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 9a054eb8a4cf7..742dfdfce6cd0 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -520,10 +520,10 @@ def _prepare_sample( f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU " "backend.") n.append(sampling_params.n) - if sampling_params.logprobs is not None: + if sampling_params.request_sample_logprobs is not None: raise NotImplementedError( "logprobs is not currently supported by the TPU backend.") - if sampling_params.prompt_logprobs is not None: + if sampling_params.request_prompt_logprobs is not None: raise NotImplementedError( "prompt_logprobs is not currently supported by the TPU " "backend.") From 5ce812877103b4220e689fd7e2c94af41a25968b Mon Sep 17 
00:00:00 2001 From: Andrew Feldman Date: Sat, 7 Dec 2024 17:32:23 +0000 Subject: [PATCH 152/293] move --- tests/v1/{samplers => sample}/__init__.py | 0 tests/v1/{samplers => sample}/test_logprobs.py | 0 tests/v1/{samplers => sample}/utils.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename tests/v1/{samplers => sample}/__init__.py (100%) rename tests/v1/{samplers => sample}/test_logprobs.py (100%) rename tests/v1/{samplers => sample}/utils.py (100%) diff --git a/tests/v1/samplers/__init__.py b/tests/v1/sample/__init__.py similarity index 100% rename from tests/v1/samplers/__init__.py rename to tests/v1/sample/__init__.py diff --git a/tests/v1/samplers/test_logprobs.py b/tests/v1/sample/test_logprobs.py similarity index 100% rename from tests/v1/samplers/test_logprobs.py rename to tests/v1/sample/test_logprobs.py diff --git a/tests/v1/samplers/utils.py b/tests/v1/sample/utils.py similarity index 100% rename from tests/v1/samplers/utils.py rename to tests/v1/sample/utils.py From bdd0abf9d061dbfc68e24e2328475f276d21f25f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 9 Dec 2024 18:08:29 +0000 Subject: [PATCH 153/293] removed VLLM_USE_V1 checks Signed-off-by: Andrew Feldman --- tests/v1/sample/test_logprobs.py | 15 ++++++--------- tests/v1/utils.py | 7 ------- 2 files changed, 6 insertions(+), 16 deletions(-) delete mode 100644 tests/v1/utils.py diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 1a1d361170187..0d8031f05e8d1 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -5,10 +5,9 @@ import torch from tests.kernels.utils import override_backend_env_variable -from tests.v1.samplers.utils import ( +from tests.v1.sample.utils import ( assert_incr_detok_str_matches_non_incr_detok_str, compute_correct_cumulative_logprob, get_test_batch) -from tests.v1.utils import assert_vllm_use_v1 from vllm import SamplingParams from ...conftest import VllmRunner @@ -27,7 +26,6 @@ def _test_case_get_logprobs_and_prompt_logprobs( example_prompts, monkeypatch, ) -> None: - assert_vllm_use_v1() test_prompts = example_prompts override_backend_env_variable(monkeypatch, "FLASH_ATTN") @@ -287,7 +285,6 @@ def test_max_logprobs(monkeypatch): Args: monkeypatch """ - assert_vllm_use_v1() override_backend_env_variable(monkeypatch, "FLASH_ATTN") runner = VllmRunner("facebook/opt-125m", max_logprobs=1) @@ -305,12 +302,12 @@ def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` Args: - vllm_runner - model - example_prompts - monkeypatch + vllm_runner: vLLM engine runner fixture + model: model name + example_prompts: list of example prompts (test fixture) + monkeypatch: supports editing env vars and rolling back changes + after the test """ - assert_vllm_use_v1() override_backend_env_variable(monkeypatch, "FLASH_ATTN") max_num_seqs = 256 diff --git a/tests/v1/utils.py b/tests/v1/utils.py deleted file mode 100644 index db9193a487c95..0000000000000 --- a/tests/v1/utils.py +++ /dev/null @@ -1,7 +0,0 @@ -"""V1 vLLM engine test utils""" -import os - - -def assert_vllm_use_v1(): - if os.getenv("VLLM_USE_V1") != "1": - raise OSError("Test requires VLLM_USE_V1=\"1\"") From 1fc981eac6e6f521f64489745aaeec9c22654b43 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 9 Dec 2024 18:15:20 +0000 Subject: [PATCH 154/293] revert logprobs name changes Signed-off-by: Andrew Feldman --- examples/llm_engine_example.py | 4 +-- 
examples/lora_with_quantization_inference.py | 16 +++++----- examples/multilora_inference.py | 12 +++---- tests/conftest.py | 14 ++++----- tests/engine/test_skip_tokenizer_init.py | 3 +- .../decoder_only/language/test_mistral.py | 4 +-- .../vision_language/test_pixtral.py | 4 +-- tests/samplers/test_logits_processor.py | 4 +-- tests/samplers/test_logprobs.py | 24 +++++++------- tests/samplers/test_ranks.py | 15 +++++---- tests/samplers/test_sampler.py | 4 +-- tests/spec_decode/e2e/conftest.py | 4 +-- tests/spec_decode/e2e/test_logprobs.py | 2 +- tests/tokenization/test_detokenize.py | 8 ++--- tests/v1/sample/test_logprobs.py | 17 +++++----- vllm/engine/llm_engine.py | 8 ++--- vllm/engine/protocol.py | 2 +- vllm/entrypoints/llm.py | 3 +- vllm/model_executor/layers/sampler.py | 17 +++++----- vllm/model_executor/sampling_metadata.py | 13 ++++---- vllm/outputs.py | 2 +- vllm/sampling_params.py | 31 ++++++++----------- vllm/spec_decode/spec_decode_worker.py | 5 ++- vllm/spec_decode/util.py | 3 +- vllm/v1/engine/processor.py | 11 +++---- vllm/v1/request.py | 4 +-- vllm/v1/worker/gpu_input_batch.py | 6 ++-- vllm/worker/hpu_model_runner.py | 8 ++--- vllm/worker/model_runner.py | 8 ++--- vllm/worker/multi_step_model_runner.py | 8 ++--- vllm/worker/tpu_model_runner.py | 4 +-- 31 files changed, 122 insertions(+), 146 deletions(-) diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index dc87ef3df1ce2..60d894aae9692 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -9,9 +9,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]: """Create a list of test prompts with their sampling parameters.""" return [ ("A robot may not injure a human being", - SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1)), + SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), ("To be or not to be,", SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), ("What is the meaning of life?", diff --git a/examples/lora_with_quantization_inference.py b/examples/lora_with_quantization_inference.py index ac2cd90ec7ceb..0c454ea50f665 100644 --- a/examples/lora_with_quantization_inference.py +++ b/examples/lora_with_quantization_inference.py @@ -22,26 +22,26 @@ def create_test_prompts( # this is an example of using quantization without LoRA ("My name is", SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128), None), # the next three examples use quantization with LoRA ("my name is", SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128), LoRARequest("lora-test-1", 1, lora_path)), ("The capital of USA is", SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128), LoRARequest("lora-test-2", 1, lora_path)), ("The capital of France is", SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128), LoRARequest("lora-test-3", 1, lora_path)), ] diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 904bb6764b2e5..043220d979c3c 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -27,8 +27,8 @@ def create_test_prompts( return [ ("A robot may not injure a human being", SamplingParams(temperature=0.0, - 
request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128), None), ("To be or not to be,", SamplingParams(temperature=0.8, @@ -38,16 +38,16 @@ def create_test_prompts( ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora", 1, lora_path)), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, - request_sample_logprobs=1, - request_prompt_logprobs=1, + logprobs=1, + prompt_logprobs=1, max_tokens=128, stop_token_ids=[32003]), LoRARequest("sql-lora2", 2, lora_path)), diff --git a/tests/conftest.py b/tests/conftest.py index 61015117a9654..d6be8f5b00af8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -794,7 +794,7 @@ def generate_w_logprobs( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.request_prompt_logprobs is None else + if sampling_params.prompt_logprobs is None else toks_str_logsprobs_prompt_logprobs) def generate_encoder_decoder_w_logprobs( @@ -807,14 +807,14 @@ def generate_encoder_decoder_w_logprobs( Logprobs generation for vLLM encoder/decoder models ''' - assert sampling_params.request_sample_logprobs is not None + assert sampling_params.logprobs is not None req_outputs = self.model.generate(encoder_decoder_prompts, sampling_params=sampling_params) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] - if sampling_params.request_prompt_logprobs is None else + if sampling_params.prompt_logprobs is None else toks_str_logsprobs_prompt_logprobs) def generate_greedy( @@ -850,8 +850,8 @@ def generate_greedy_logprobs( greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, - request_sample_logprobs=num_logprobs, - request_prompt_logprobs=num_prompt_logprobs, + logprobs=num_logprobs, + prompt_logprobs=num_prompt_logprobs, stop_token_ids=stop_token_ids, stop=stop) @@ -872,8 +872,8 @@ def generate_encoder_decoder_greedy_logprobs( greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, - request_sample_logprobs=num_logprobs, - request_prompt_logprobs=(num_prompt_logprobs), + logprobs=num_logprobs, + prompt_logprobs=(num_prompt_logprobs), ) ''' Greedy logprobs generation for vLLM encoder/decoder models diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 09c9ed1474880..b8818af5614cf 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -10,8 +10,7 @@ def test_skip_tokenizer_initialization(model: str): # of tokenizer and detokenizer. The generated output is expected to contain # token ids. 
llm = LLM(model=model, skip_tokenizer_init=True) - sampling_params = SamplingParams(request_prompt_logprobs=True, - detokenize=True) + sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) with pytest.raises(ValueError, match="cannot pass text prompts when"): llm.generate("abc", sampling_params) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 68b95fb800bcb..99b5d5694f9f7 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -24,9 +24,7 @@ # "mistralai/Mistral-Nemo-Instruct-2407" ] -SAMPLING_PARAMS = SamplingParams(max_tokens=512, - temperature=0.0, - request_sample_logprobs=5) +SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese "寫一首關於勇敢的水手的詩", # chinese diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 492cafa8a18a7..90c0fab99054c 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -116,9 +116,7 @@ def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: _create_engine_inputs(IMG_URLS), ] -SAMPLING_PARAMS = SamplingParams(max_tokens=512, - temperature=0.0, - request_sample_logprobs=5) +SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) LIMIT_MM_PER_PROMPT = dict(image=4) MAX_MODEL_LEN = [8192, 65536] diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 646ef56f23a7b..2979470120710 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -29,7 +29,7 @@ def pick_vllm(token_ids, logits): params_with_logprobs = SamplingParams( logits_processors=[pick_vllm], - request_prompt_logprobs=3, + prompt_logprobs=3, max_tokens=max_tokens, ) @@ -43,7 +43,7 @@ def pick_vllm(token_ids, logits): vllm_model.model._add_request( example_prompts[1], params=SamplingParams( - request_prompt_logprobs=3, + prompt_logprobs=3, max_tokens=max_tokens, ), ) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index dcd75c7539fe2..c07c71e38233f 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -49,12 +49,11 @@ def test_get_prompt_logprobs( max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, ) as vllm_model: - vllm_sampling_params = SamplingParams( - max_tokens=max_tokens, - request_sample_logprobs=num_top_logprobs, - request_prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) + vllm_sampling_params = SamplingParams(max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_top_logprobs, + temperature=0.0, + detokenize=detokenize) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) @@ -132,11 +131,11 @@ def test_get_prompt_logprobs( def test_max_logprobs(): runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(request_sample_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) # should pass runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(request_sample_logprobs=2) + bad_sampling_params = SamplingParams(logprobs=2) with pytest.raises(ValueError): runner.generate(["Hello world"], 
sampling_params=bad_sampling_params) @@ -161,11 +160,10 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, ) as vllm_model: - sampling_params_logprobs_none = SamplingParams( - max_tokens=max_tokens, - request_sample_logprobs=None, - temperature=0.0, - detokenize=detokenize) + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + temperature=0.0, + detokenize=detokenize) results_logprobs_none = vllm_model.model.generate( example_prompts, sampling_params=sampling_params_logprobs_none) diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index ba41fc615d14a..ed2fee1ae252e 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -25,18 +25,17 @@ def test_ranks( temperature=0.0, top_p=1.0, max_tokens=max_tokens, - request_sample_logprobs=num_top_logprobs, - request_prompt_logprobs=num_prompt_logprobs) + logprobs=num_top_logprobs, + prompt_logprobs=num_prompt_logprobs) vllm_results = vllm_model.generate_w_logprobs(example_prompts, vllm_sampling_params) ## Test non-greedy logprobs ranks - sampling_params = SamplingParams( - temperature=1.0, - top_p=1.0, - max_tokens=max_tokens, - request_sample_logprobs=num_top_logprobs, - request_prompt_logprobs=num_prompt_logprobs) + sampling_params = SamplingParams(temperature=1.0, + top_p=1.0, + max_tokens=max_tokens, + logprobs=num_top_logprobs, + prompt_logprobs=num_prompt_logprobs) res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) for result in vllm_results: diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 4c1dfb48fbe6f..28c34064f670c 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -176,7 +176,7 @@ def create_sampling_params(min_tokens, max_tokens=9999, # keep higher than max of min_tokens stop_token_ids=stop_token_ids, # requesting prompt_logprobs changes the structure of `logits` - request_prompt_logprobs=prompt_logprobs, + prompt_logprobs=prompt_logprobs, ) sampling_params.all_stop_token_ids.add(eos_token_id) return sampling_params @@ -395,7 +395,7 @@ def run_test_case(*, expected_penalization: List[bool], seq_lens.append(prompt_len) assert sgm.sampling_params is not None - if sgm.sampling_params.request_prompt_logprobs: + if sgm.sampling_params.prompt_logprobs: # with prompt_logprobs each token in the prompt has a row in # logits num_rows = prompt_len diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 39a9dab2b9f11..b9cb3858c0068 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -196,8 +196,8 @@ def run_equality_correctness_test( max_tokens=max_output_len, seed=seed, ignore_eos=ignore_eos, - request_sample_logprobs=logprobs, - request_prompt_logprobs=prompt_logprobs) + logprobs=logprobs, + prompt_logprobs=prompt_logprobs) with vllm_runner(**org_args) as vllm_model: org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 7d0d90615bac2..4cfca8b78e79b 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -211,7 +211,7 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, max_tokens=output_len, ignore_eos=True, temperature=temperature, - request_sample_logprobs=logprobs, + logprobs=logprobs, ) sd_args = { diff --git 
a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 2fce280b188bb..84348cbc0bced 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -201,7 +201,7 @@ def test_decode_sequence_logprobs(complete_sequence: str, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, - request_sample_logprobs=2) + logprobs=2) # Run sequentially. seq = create_sequence() @@ -234,7 +234,7 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], detokenizer: Detokenizer): """Verify Detokenizer decodes prompt logprobs correctly.""" sampling_params = SamplingParams(skip_special_tokens=True, - request_prompt_logprobs=1) + prompt_logprobs=1) # Run sequentially. seq = create_sequence(complete_sequence_token_ids) @@ -294,8 +294,8 @@ def test_decode_prompt_logprobs_chunked_prefill( max_num_seqs=max_num_seqs) as vllm_model: vllm_sampling_params = SamplingParams(max_tokens=10, - request_sample_logprobs=5, - request_prompt_logprobs=5, + logprobs=5, + prompt_logprobs=5, temperature=0.0) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 0d8031f05e8d1..68c72c63786ec 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -62,8 +62,8 @@ def _test_case_get_logprobs_and_prompt_logprobs( # Generate SamplingParams vllm_sampling_params = [ SamplingParams(max_tokens=max_tokens, - request_sample_logprobs=lp, - request_prompt_logprobs=plp, + logprobs=lp, + prompt_logprobs=plp, temperature=0.0, detokenize=detokenize) for lp, plp in logprob_prompt_logprob_list @@ -288,11 +288,11 @@ def test_max_logprobs(monkeypatch): override_backend_env_variable(monkeypatch, "FLASH_ATTN") runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(request_sample_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) # should pass runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(request_sample_logprobs=2) + bad_sampling_params = SamplingParams(logprobs=2) with pytest.raises(ValueError): runner.generate(["Hello world"], sampling_params=bad_sampling_params) @@ -319,11 +319,10 @@ def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, ) as vllm_model: - sampling_params_logprobs_none = SamplingParams( - max_tokens=max_tokens, - request_sample_logprobs=None, - request_prompt_logprobs=None, - temperature=0.0) + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0) results_logprobs_none = vllm_model.model.generate( example_prompts, sampling_params=sampling_params_logprobs_none) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8286e9ce9c70d..560f84a008291 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -847,10 +847,10 @@ def _create_sequence_group_with_sampling( ) -> SequenceGroup: """Creates a SequenceGroup with SamplingParams.""" max_logprobs = self.get_model_config().max_logprobs - if (sampling_params.request_sample_logprobs - and sampling_params.request_sample_logprobs > max_logprobs - ) or (sampling_params.request_prompt_logprobs - and sampling_params.request_prompt_logprobs > 
max_logprobs): + if (sampling_params.logprobs + and sampling_params.logprobs > max_logprobs) or ( + sampling_params.prompt_logprobs + and sampling_params.prompt_logprobs > max_logprobs): raise ValueError(f"Cannot request more than " f"{max_logprobs} logprobs.") diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index dac592f9f373d..4079de7d36793 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -95,7 +95,7 @@ async def beam_search( tokenizer.eos_token_id, length_penalty) beam_search_params = SamplingParams( - request_sample_logprobs=2 * beam_width, + logprobs=2 * beam_width, max_tokens=1, temperature=temperature, ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b2a13143cdb4d..8de30ccd18a11 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -461,8 +461,7 @@ def sort_beams_key(x: BeamSearchSequence) -> float: # generate 2 * beam_width candidates at each step # following the huggingface transformers implementation # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa - beam_search_params = SamplingParams(request_sample_logprobs=2 * - beam_width, + beam_search_params = SamplingParams(logprobs=2 * beam_width, max_tokens=1, temperature=temperature) instances: List[BeamSearchInstance] = [] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 89156850900f7..c10efefea5471 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -967,9 +967,9 @@ def get_logprobs( # Update indices and tokens for prompt logprobs. if (seq_group.is_prompt - and sampling_params.request_prompt_logprobs is not None): + and sampling_params.prompt_logprobs is not None): largest_num_logprobs = max(largest_num_logprobs, - sampling_params.request_prompt_logprobs) + sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) next_token_ids.extend(next_prompt_tokens) @@ -986,10 +986,9 @@ def get_logprobs( [query_idx + parent_id for parent_id in parent_seq_ids]) next_token_ids.extend(token_ids) - if sampling_params.request_sample_logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, - sampling_params.request_sample_logprobs) + if sampling_params.logprobs is not None: + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.logprobs) assert len(next_token_ids) == len(query_indices) @@ -1071,9 +1070,9 @@ def _get_prompt_logprob_if_needed( # Find prompt logprobs prompt_logprobs: Optional[PromptLogprobs] = None - if is_prompt and sampling_params.request_prompt_logprobs is not None: + if is_prompt and sampling_params.prompt_logprobs is not None: prompt_logprobs = [] - num_logprobs = sampling_params.request_prompt_logprobs + num_logprobs = sampling_params.prompt_logprobs next_prompt_tokens = _get_next_prompt_tokens(seq_group) # Pre-select indexes and create a list. It is faster than calling .item # repetitively. 
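# Illustrative sketch (not the actual vLLM code): the get_logprobs hunks above only
# track `largest_num_logprobs`, the max of `logprobs` / `prompt_logprobs` across the
# batch, because a single top-k gather of that size can serve every request; each
# request then keeps just the first k entries it asked for. Roughly:
import torch

token_logprobs = torch.log_softmax(torch.randn(4, 32000), dim=-1)  # [num_tokens, vocab_size]
requested_k = [5, 3, 3, 5]               # per-request logprobs / prompt_logprobs values
largest_num_logprobs = max(requested_k)  # one gather sized for the whole batch
top_vals, top_ids = torch.topk(token_logprobs, largest_num_logprobs, dim=-1)
per_request = [(top_vals[i, :k], top_ids[i, :k]) for i, k in enumerate(requested_k)]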
@@ -1128,7 +1127,7 @@ def _get_sampled_logprob_if_needed( ): """Compute the sample logprob if needed.""" seq_ids = seq_group.seq_ids - num_logprobs = seq_group.sampling_params.request_sample_logprobs + num_logprobs = seq_group.sampling_params.logprobs sampled_logprobs: SampleLogprobs = [] next_token_ids, parent_seq_ids = sample_result diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 579319ffdf2ed..a58589bb915ed 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -52,7 +52,7 @@ def do_sample(self): def __post_init__(self): if len(self.prompt_logprob_indices) > 0: - assert self.sampling_params.request_prompt_logprobs is not None + assert self.sampling_params.prompt_logprobs is not None if self.is_prompt: assert self.seq_len is not None assert self.query_len is not None @@ -300,7 +300,7 @@ def _prepare_seq_groups( logits = hidden_states[selected_token_indices] """ - if sampling_params.request_prompt_logprobs is not None: + if sampling_params.prompt_logprobs is not None: selected_token_indices.extend( range(model_output_idx, model_output_idx + prompt_logprob_len)) model_output_idx += prompt_logprob_len @@ -322,7 +322,7 @@ def sample(logits): # sample_indices to find sample indices. """ - if sampling_params.request_prompt_logprobs is not None: + if sampling_params.prompt_logprobs is not None: prompt_logprob_indices.extend( range(logit_idx, logit_idx + prompt_logprob_len)) logit_idx += prompt_logprob_len @@ -426,8 +426,7 @@ def from_sampling_metadata( do_penalties = True is_prompt = seq_group.is_prompt - if (is_prompt - and sampling_params.request_prompt_logprobs is not None): + if (is_prompt and sampling_params.prompt_logprobs is not None): # For tokens in the prompt that we only need to get # their logprobs query_len = seq_group.query_len @@ -456,8 +455,8 @@ def from_sampling_metadata( for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params - if (seq_group.is_prompt and - sampling_params.request_prompt_logprobs is not None): + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) prompt_tokens.extend( array(VLLM_TOKEN_ID_ARRAY_TYPE) diff --git a/vllm/outputs.py b/vllm/outputs.py index c6d0a31cbd8d8..c412d5ce21571 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -205,7 +205,7 @@ def from_seq_group( # NOTE: We need omit logprobs here explicitly because the sequence # always has the logprobs of the sampled tokens even if the # logprobs are not requested. - include_logprobs = sampling_params.request_sample_logprobs is not None + include_logprobs = sampling_params.logprobs is not None text_buffer_length = sampling_params.output_text_buffer_length delta = sampling_params.output_kind == RequestOutputKind.DELTA diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index cc4d16b3dc6ce..55664c6cf787a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -186,8 +186,8 @@ class SamplingParams( min_tokens: int = 0 # Number of sample logprobs and prompt logprobs, # respectively, requested - request_sample_logprobs: Optional[int] = None - request_prompt_logprobs: Optional[int] = None + logprobs: Optional[int] = None + prompt_logprobs: Optional[int] = None # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs. 
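# For orientation, the SamplingParams hunk above restores the public field names, so
# callers request logprobs directly via `logprobs` / `prompt_logprobs`. A minimal
# usage sketch (model name and counts are placeholders, not taken from this patch):
from vllm import LLM, SamplingParams

params = SamplingParams(
    temperature=0.0,
    max_tokens=16,
    logprobs=5,         # top-5 logprobs for each sampled token
    prompt_logprobs=3,  # top-3 logprobs for each prompt token
)
llm = LLM(model="facebook/opt-125m")
output = llm.generate(["Hello, my name is"], params)[0]
print(output.outputs[0].logprobs)  # sample logprobs, one entry per generated token
print(output.prompt_logprobs)      # prompt logprobs, one entry per prompt token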
@@ -270,8 +270,8 @@ def from_optional( ignore_eos=ignore_eos, max_tokens=max_tokens, min_tokens=min_tokens, - request_sample_logprobs=logprobs, - request_prompt_logprobs=prompt_logprobs, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, detokenize=detokenize, skip_special_tokens=skip_special_tokens, spaces_between_special_tokens=spaces_between_special_tokens, @@ -328,12 +328,9 @@ def __post_init__(self) -> None: else: self.bad_words = list(self.bad_words) - self.request_sample_logprobs = (1 - if self.request_sample_logprobs is True - else self.request_sample_logprobs) - self.request_prompt_logprobs = (1 - if self.request_prompt_logprobs is True - else self.request_prompt_logprobs) + self.logprobs = (1 if self.logprobs is True else self.logprobs) + self.prompt_logprobs = (1 if self.prompt_logprobs is True else + self.prompt_logprobs) # Number of characters to hold back for stop string evaluation # until sequence is finished. @@ -390,14 +387,12 @@ def _verify_args(self) -> None: raise ValueError( f"min_tokens must be less than or equal to " f"max_tokens={self.max_tokens}, got {self.min_tokens}.") - if (self.request_sample_logprobs is not None - and self.request_sample_logprobs < 0): + if (self.logprobs is not None and self.logprobs < 0): raise ValueError(f"logprobs must be non-negative, " - f"got {self.request_sample_logprobs}.") - if (self.request_prompt_logprobs is not None - and self.request_prompt_logprobs < 0): + f"got {self.logprobs}.") + if (self.prompt_logprobs is not None and self.prompt_logprobs < 0): raise ValueError(f"prompt_logprobs must be non-negative, got " - f"{self.request_prompt_logprobs}.") + f"{self.prompt_logprobs}.") if (self.truncate_prompt_tokens is not None and self.truncate_prompt_tokens < 1): raise ValueError(f"truncate_prompt_tokens must be >= 1, " @@ -488,8 +483,8 @@ def __repr__(self) -> str: f"ignore_eos={self.ignore_eos}, " f"max_tokens={self.max_tokens}, " f"min_tokens={self.min_tokens}, " - f"logprobs={self.request_sample_logprobs}, " - f"prompt_logprobs={self.request_prompt_logprobs}, " + f"logprobs={self.logprobs}, " + f"prompt_logprobs={self.prompt_logprobs}, " f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index f76b1bbd7aa07..2689802161987 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -543,9 +543,8 @@ def _serialize_sampler_output_no_logprobs( populated. 
""" seq_output_prompt_logprobs = [ - seq.is_prompt - and seq.sampling_params.request_prompt_logprobs is not None - and seq.sampling_params.request_prompt_logprobs > 0 + seq.is_prompt and seq.sampling_params.prompt_logprobs is not None + and seq.sampling_params.prompt_logprobs > 0 for seq in execute_model_req.seq_group_metadata_list ] # ignore slots for prompt tokens that are filled with INVALID_TOKEN_ID diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 1ecc653521ad9..0b6003673578e 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -23,8 +23,7 @@ def get_all_num_logprobs( all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: - num_logprobs = ( - seq_group_metadata.sampling_params.request_sample_logprobs) + num_logprobs = (seq_group_metadata.sampling_params.logprobs) if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 535874a1fd6de..3f6fc33d5cae0 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -59,10 +59,9 @@ def _assert_valid_sample_logprobs_prompt_logprobs( """ if isinstance(params, SamplingParams) and ( - (params.request_sample_logprobs - and params.request_sample_logprobs > max_logprobs) or - (params.request_prompt_logprobs - and params.request_prompt_logprobs > max_logprobs)): + (params.logprobs and params.logprobs > max_logprobs) or + (params.prompt_logprobs + and params.prompt_logprobs > max_logprobs)): raise ValueError(f"Cannot request more than " f"{max_logprobs} logprobs or prompt logprobs.") @@ -167,8 +166,8 @@ def process_inputs( sampling_params.output_kind, sampling_params.stop, sampling_params.include_stop_str_in_output, - sampling_params.request_sample_logprobs, - sampling_params.request_prompt_logprobs, + sampling_params.logprobs, + sampling_params.prompt_logprobs, ) # Make Request for EngineCore. diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 7fd37f2effe0c..bf789c5a01f66 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -48,8 +48,8 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() # Number of sample logprobs and prompt logprobs requested, # respectively - self.request_sample_logprobs = sampling_params.request_sample_logprobs - self.request_prompt_logprobs = sampling_params.request_prompt_logprobs + self.request_sample_logprobs = sampling_params.logprobs + self.request_prompt_logprobs = sampling_params.prompt_logprobs # If sample logprobs are enabled, the number of sample logprobs cannot # be anticipated in advance (because the LLM is partially responsible # for deciding when the completion is finished.) 
So, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 1d59d798896f6..d88350e8303a9 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -150,13 +150,13 @@ def add_request( self.generators[req_index] = request.generator - num_logprobs = sampling_params.request_sample_logprobs - num_prompt_logprobs = sampling_params.request_prompt_logprobs + num_logprobs = sampling_params.logprobs + num_prompt_logprobs = sampling_params.prompt_logprobs if num_logprobs is not None and num_logprobs > 0: self.num_logprobs[req_id] = num_logprobs if num_prompt_logprobs is not None and num_prompt_logprobs > 0: self.num_prompt_logprobs[req_id] = num_prompt_logprobs - if sampling_params.request_prompt_logprobs: + if sampling_params.prompt_logprobs: self.prompt_logprob_reqs.add(req_id) def remove_request(self, req_id: str) -> Optional[int]: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 42ed3fa39abf3..0a7699cba1f32 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -846,8 +846,8 @@ def _prepare_prompt( lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( [lora_id] * - (max_prompt_len - context_len if seq_group_metadata. - sampling_params.request_prompt_logprobs else 1)) + (max_prompt_len - context_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -1154,8 +1154,8 @@ def prepare_input_tensors( paddings = list(itertools.accumulate(paddings)) paddings_prompt_logprobs = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): - if (seq_group_metadata.sampling_params.request_prompt_logprobs - is not None and seq_group_metadata.is_prompt): + if (seq_group_metadata.sampling_params.prompt_logprobs is not None + and seq_group_metadata.is_prompt): paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) paddings = torch.tensor( paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a27ada83d5da7..1bc5f65c7127f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -625,8 +625,8 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, inter_data.lora_prompt_mapping.append( [lora_id] * (query_len if seq_group_metadata.sampling_params - and seq_group_metadata.sampling_params.request_prompt_logprobs - is not None else 1)) + and seq_group_metadata.sampling_params.prompt_logprobs is not None + else 1)) def _compute_prompt_adapter_input( self, inter_data: InterDataForSeqGroup, @@ -653,8 +653,8 @@ def _compute_prompt_adapter_input( prompt_adapter_id ] * num_tokens + [0] * (query_len - num_tokens) inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( - query_len if seq_group_metadata.sampling_params and - seq_group_metadata.sampling_params.request_prompt_logprobs else 1) + query_len if seq_group_metadata.sampling_params + and seq_group_metadata.sampling_params.prompt_logprobs else 1) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 0783fed12daf8..3ca0d88a42183 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -775,14 +775,12 @@ def _pythonize_sampler_output( seq_groups = sampling_metadata.seq_groups 
prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.request_prompt_logprobs is not None and sg.is_prompt + sg.sampling_params.prompt_logprobs is not None and sg.is_prompt for sg in seq_groups ]) any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill or any([ - sg.sampling_params.request_sample_logprobs is not None - for sg in seq_groups - ])) + prompt_logprobs_are_requested_for_prefill + or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) if prompt_logprobs_are_requested_for_prefill: # CPU GPU sync, after gathering *only* sampled tokens (since diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 742dfdfce6cd0..9a054eb8a4cf7 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -520,10 +520,10 @@ def _prepare_sample( f"Best of > {_MAX_NUM_SAMPLES} is not supported by the TPU " "backend.") n.append(sampling_params.n) - if sampling_params.request_sample_logprobs is not None: + if sampling_params.logprobs is not None: raise NotImplementedError( "logprobs is not currently supported by the TPU backend.") - if sampling_params.request_prompt_logprobs is not None: + if sampling_params.prompt_logprobs is not None: raise NotImplementedError( "prompt_logprobs is not currently supported by the TPU " "backend.") From dc63ac12513dd55952701115d53e614cf21a16a9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 9 Dec 2024 18:24:31 +0000 Subject: [PATCH 155/293] removing some unnecessary changes' Signed-off-by: Andrew Feldman --- vllm/sampling_params.py | 12 +++++------- vllm/spec_decode/util.py | 2 +- vllm/v1/core/scheduler.py | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 55664c6cf787a..fc77f3ca529b2 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -184,8 +184,6 @@ class SamplingParams( ignore_eos: bool = False max_tokens: Optional[int] = 16 min_tokens: int = 0 - # Number of sample logprobs and prompt logprobs, - # respectively, requested logprobs: Optional[int] = None prompt_logprobs: Optional[int] = None # NOTE: This parameter is only exposed at the engine level for now. 
@@ -328,7 +326,7 @@ def __post_init__(self) -> None: else: self.bad_words = list(self.bad_words) - self.logprobs = (1 if self.logprobs is True else self.logprobs) + self.logprobs = 1 if self.logprobs is True else self.logprobs self.prompt_logprobs = (1 if self.prompt_logprobs is True else self.prompt_logprobs) @@ -387,10 +385,10 @@ def _verify_args(self) -> None: raise ValueError( f"min_tokens must be less than or equal to " f"max_tokens={self.max_tokens}, got {self.min_tokens}.") - if (self.logprobs is not None and self.logprobs < 0): - raise ValueError(f"logprobs must be non-negative, " - f"got {self.logprobs}.") - if (self.prompt_logprobs is not None and self.prompt_logprobs < 0): + if self.logprobs is not None and self.logprobs < 0: + raise ValueError( + f"logprobs must be non-negative, got {self.logprobs}.") + if self.prompt_logprobs is not None and self.prompt_logprobs < 0: raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") if (self.truncate_prompt_tokens is not None diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 0b6003673578e..da8706658d09a 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -23,7 +23,7 @@ def get_all_num_logprobs( all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: - num_logprobs = (seq_group_metadata.sampling_params.logprobs) + num_logprobs = seq_group_metadata.sampling_params.logprobs if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b71d1b3718528..ecf1d105d4d65 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -156,9 +156,9 @@ def schedule(self) -> "SchedulerOutput": ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens + req_index += 1 has_partial_request = (request.num_computed_tokens + num_new_tokens < request.num_tokens) - req_index += 1 # Encoder-related. if encoder_inputs_to_schedule: @@ -234,8 +234,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (request.num_computed_tokens + - num_new_tokens < request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens < + request.num_tokens) # Encoder-related. 
if encoder_inputs_to_schedule: From 4f304083c27351faca321f987c07eb7ee1612577 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 9 Dec 2024 18:27:32 +0000 Subject: [PATCH 156/293] removed fast checks Signed-off-by: Andrew Feldman --- tests/v1/sample/test_logprobs.py | 38 -------------------------------- 1 file changed, 38 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 68c72c63786ec..275f6b8335f4a 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -1,4 +1,3 @@ -import os from typing import List import pytest @@ -240,43 +239,6 @@ def test_get_logprobs_and_prompt_logprobs( monkeypatch=monkeypatch) -# LLM engine v1 -@pytest.mark.skipif(os.getenv("VLLM_V1_FAST_TESTS") != "1", - reason="vLLM v1 fast tests not enabled by " - "VLLM_V1_FAST_TESTS=\"1\" in the environment.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["half"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("max_num_batched_tokens", [128]) -@pytest.mark.parametrize("batch_logprobs_composition", - ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) -def test_fast_get_logprobs_and_prompt_logprobs( - hf_runner, - vllm_runner, - model: str, - dtype: str, - batch_logprobs_composition: str, - max_num_batched_tokens: int, - example_prompts, - monkeypatch, -) -> None: - """Fast test: V1 Engine logprobs & prompt logprobs - - Faster version of `test_get_logprobs_and_prompt_logprobs` with - fewer test cases. - """ - _test_case_get_logprobs_and_prompt_logprobs( - hf_runner=hf_runner, - vllm_runner=vllm_runner, - model=model, - dtype=dtype, - detokenize=True, - batch_logprobs_composition=batch_logprobs_composition, - max_num_batched_tokens=max_num_batched_tokens, - example_prompts=example_prompts, - monkeypatch=monkeypatch) - - def test_max_logprobs(monkeypatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` From 77488cb324b94a8bf5bfc5ff07a0137bf5633cc5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 12 Dec 2024 10:53:20 +0000 Subject: [PATCH 157/293] wip test_completion --- .../v1/entrypoints/openai/test_completion.py | 781 ++++++++++++++++++ 1 file changed, 781 insertions(+) create mode 100644 tests/v1/entrypoints/openai/test_completion.py diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py new file mode 100644 index 0000000000000..20255d6b33b06 --- /dev/null +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -0,0 +1,781 @@ +# imports for guided decoding tests +import json +import re +import shutil +from tempfile import TemporaryDirectory +from typing import Dict, List, Optional + +import jsonschema +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +# downloading lora to test lora requests +from huggingface_hub import snapshot_download +from openai import BadRequestError +from transformers import AutoTokenizer + +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically these adapters use a different base model, +# but we're not testing generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" +PA_NAME = "swapnilbp/llama_tweet_ptune" +# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also +# need to change to match the prompt adapter +PA_NUM_VIRTUAL_TOKENS = 8 + + 
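# The tests added below exercise sample and prompt logprobs through the
# OpenAI-compatible Completions API. Rough client-side sketch (hypothetical helper;
# base_url/api_key are placeholders, and the real tests use RemoteOpenAIServer):
import openai

async def _logprobs_request_sketch(client: openai.AsyncOpenAI) -> None:
    # `logprobs` follows the OpenAI spec; `prompt_logprobs` is a vLLM extension
    # passed through `extra_body`, as the tests below do.
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
        extra_body={"prompt_logprobs": 1},
    )
    print(completion.choices[0].logprobs.top_logprobs)
    print(completion.choices[0].prompt_logprobs)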
+@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def zephyr_lora_added_tokens_files(zephyr_lora_files): + tmp_dir = TemporaryDirectory() + tmp_model_dir = f"{tmp_dir.name}/zephyr" + shutil.copytree(zephyr_lora_files, tmp_model_dir) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + # Copy tokenizer to adapter and add some unique tokens + # 32000, 32001, 32002 + added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], + special_tokens=True) + assert added == 3 + tokenizer.save_pretrained(tmp_model_dir) + yield tmp_model_dir + tmp_dir.cleanup() + + +@pytest.fixture(scope="module") +def zephyr_pa_files(): + return snapshot_download(repo_id=PA_NAME) + + +@pytest.fixture(scope="module") +def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, + zephyr_pa_files): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_added_tokens_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + # pa config + "--enable-prompt-adapter", + "--prompt-adapters", + f"zephyr-pa={zephyr_pa_files}", + f"zephyr-pa2={zephyr_pa_files}", + "--max-prompt-adapters", + "2", + "--max-prompt-adapter-token", + "128", + ] + + +@pytest.fixture(scope="module", + params=["", "--disable-frontend-multiprocessing"]) +def server(default_server_args, request): + if request.param: + default_server_args.append(request.param) + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras, then test prompt adapters + "model_name,num_virtual_tokens", + [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0), + ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS), + ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)], +) +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, + num_virtual_tokens: int): + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, + prompt_tokens=6 + num_virtual_tokens, + total_tokens=11 + num_virtual_tokens) + + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 1 + assert completion.choices[0].prompt_logprobs is None + + +@pytest.mark.asyncio +async def test_added_lora_tokens(client: openai.AsyncOpenAI): + # test using token IDs + completion = await client.completions.create( + model="zephyr-lora2", + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) + # Added tokens should appear in tokenized prompt + assert completion.choices[0].text.startswith("vllm1vllm2vllm3") + + +@pytest.mark.asyncio +async def 
test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): + # test using token IDs + with pytest.raises(openai.BadRequestError, match="out of vocabulary"): + # Added tokens should be rejected by the base model + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras, then test prompt adapters + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"], +) +async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=None, + ) + choice = completion.choices[0] + assert choice.logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # just test 1 lora and 1 pa hereafter + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=0, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=5, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, + model_name: str): + + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + # vLLM has higher default max_logprobs (20 instead of 5) to support + # both Completion API and Chat Completion API + logprobs=21, + ) + ... + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + stream = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + # vLLM has higher default max_logprobs (20 instead of 5) to support + # both Completion API and Chat Completion API + logprobs=30, + stream=True, + ) + async for chunk in stream: + ... 
+ + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), + (MODEL_NAME, 0), + (MODEL_NAME, 1), + (MODEL_NAME, None)]) +async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, + model_name: str, + prompt_logprobs: Optional[int]): + params: Dict = { + "prompt": ["A robot may not injure another robot", "My name is"], + "model": model_name, + } + if prompt_logprobs is not None: + params["extra_body"] = {"prompt_logprobs": prompt_logprobs} + + if prompt_logprobs is not None and prompt_logprobs < 0: + with pytest.raises(BadRequestError): + await client.completions.create(**params) + else: + completion = await client.completions.create(**params) + if prompt_logprobs is not None: + assert completion.choices[0].prompt_logprobs is not None + assert len(completion.choices[0].prompt_logprobs) > 0 + + assert completion.choices[1].prompt_logprobs is not None + assert len(completion.choices[1].prompt_logprobs) > 0 + + else: + assert completion.choices[0].prompt_logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_completion_streaming(client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is an LLM?" + + single_completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == "length" + assert chunk.choices[0].text + assert "".join(chunks) == single_output + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): + """Streaming for parallel sampling. + The tokens from multiple samples, are flattened into a single stream, + with an index to indicate which sample the token belongs to. + """ + + prompt = "What is an LLM?" + n = 3 + max_tokens = 5 + + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=max_tokens, + n=n, + stream=True) + chunks: List[List[str]] = [[] for i in range(n)] + finish_reason_count = 0 + async for chunk in stream: + index = chunk.choices[0].index + text = chunk.choices[0].text + chunks[index].append(text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert finish_reason_count == n + for chunk in chunks: + assert len(chunk) == max_tokens + print("".join(chunk)) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_completion_stream_options(client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" 
+ + # Test stream=True, stream_options= + # {"include_usage": False, "continuous_usage_stats": False} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": + False, + }) + + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options= + # {"include_usage": False, "continuous_usage_stats": True} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": + True, + }) + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options= + # {"include_usage": True, "continuous_usage_stats": False} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": + False, + }) + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + else: + assert chunk.usage is None + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=True, stream_options= + # {"include_usage": True, "continuous_usage_stats": True} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": + True, + }) + async for chunk in stream: + assert chunk.usage is not None + assert chunk.usage.prompt_tokens > 0 + assert chunk.usage.completion_tokens > 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + if chunk.choices[0].finish_reason is not None: + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=False, stream_options= + # {"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) + + # Test stream=False, stream_options= + # {"include_usage": True} + with pytest.raises(BadRequestError): + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + + # Test stream=False, stream_options= + # {"continuous_usage_stats": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"continuous_usage_stats": None}) + + # Test stream=False, stream_options= + # {"continuous_usage_stats": True} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + 
stream_options={"continuous_usage_stats": True}) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): + # test both text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text + + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but + # not necessary for official client. + use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] + + +@pytest.mark.asyncio +async def test_logits_bias(client: openai.AsyncOpenAI): + prompt = "Hello, my name is" + max_tokens = 5 + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + + # Test exclusive selection + token_id = 1000 + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token_id): 100}, + seed=42, + ) + assert len(completion.choices[0].text) >= 5 + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), + add_special_tokens=False)["input_ids"] + assert all([ + response == expected + for response, expected in zip(response_tokens, expected_tokens) + ]) + + # Test ban + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + ) + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + first_response = completion.choices[0].text + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token): -100 + for token in response_tokens}, + ) + assert first_response != completion.choices[0].text + + +@pytest.mark.asyncio +async def test_allowed_token_ids(client: openai.AsyncOpenAI): + prompt = "Hello, my name is" + max_tokens = 1 + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + + # Test exclusive selection + allowed_ids = [21555, 21557, 21558] + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + seed=42, + extra_body=dict(allowed_token_ids=allowed_ids), + logprobs=1, + ) + response_tokens = completion.choices[0].logprobs.tokens + assert len(response_tokens) == 1 + assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids + + +@pytest.mark.asyncio 
+@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_json_completion(client: openai.AsyncOpenAI, + guided_decoding_backend: str, + sample_json_schema): + completion = await client.completions.create( + model=MODEL_NAME, + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {sample_json_schema}", + n=3, + temperature=1.0, + max_tokens=500, + extra_body=dict(guided_json=sample_json_schema, + guided_decoding_backend=guided_decoding_backend)) + + assert completion.id is not None + assert len(completion.choices) == 3 + for i in range(3): + output_json = json.loads(completion.choices[i].text) + jsonschema.validate(instance=output_json, schema=sample_json_schema) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_regex_completion(client: openai.AsyncOpenAI, + guided_decoding_backend: str, + sample_regex): + completion = await client.completions.create( + model=MODEL_NAME, + prompt=f"Give an example IPv4 address with this regex: {sample_regex}", + n=3, + temperature=1.0, + max_tokens=20, + extra_body=dict(guided_regex=sample_regex, + guided_decoding_backend=guided_decoding_backend)) + + assert completion.id is not None + assert len(completion.choices) == 3 + for i in range(3): + assert re.fullmatch(sample_regex, + completion.choices[i].text) is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_choice_completion(client: openai.AsyncOpenAI, + guided_decoding_backend: str, + sample_guided_choice): + completion = await client.completions.create( + model=MODEL_NAME, + prompt="The best language for type-safe systems programming is ", + n=2, + temperature=1.0, + max_tokens=10, + extra_body=dict(guided_choice=sample_guided_choice, + guided_decoding_backend=guided_decoding_backend)) + + assert completion.id is not None + assert len(completion.choices) == 2 + for i in range(2): + assert completion.choices[i].text in sample_guided_choice + + +@pytest.mark.asyncio +async def test_guided_grammar(client: openai.AsyncOpenAI, + sample_sql_statements): + + completion = await client.completions.create( + model=MODEL_NAME, + prompt=("Generate a sql state that select col_1 from " + "table_1 where it is equals to 1"), + temperature=1.0, + max_tokens=500, + extra_body=dict(guided_grammar=sample_sql_statements)) + + content = completion.choices[0].text + + # use Lark to parse the output, and make sure it's a valid parse tree + from lark import Lark + parser = Lark(sample_sql_statements) + parser.parse(content) + + # remove spaces for comparison b/c we removed them in the grammar + ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") + + assert content.strip() == ground_truth + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +@pytest.mark.parametrize("logprobs_arg", [1, 0]) +async def test_echo_logprob_completion(client: openai.AsyncOpenAI, + model_name: str, logprobs_arg: int): + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + # test using text and token IDs + for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): + completion = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + echo=True, + logprobs=logprobs_arg) + + prompt_text = 
tokenizer.decode(prompt) if isinstance(prompt, + list) else prompt + assert re.search(r"^" + prompt_text, completion.choices[0].text) + logprobs = completion.choices[0].logprobs + assert logprobs is not None + assert len(logprobs.text_offset) > 5 + assert (len(logprobs.token_logprobs) > 5 + and logprobs.token_logprobs[0] is None) + assert (len(logprobs.top_logprobs) > 5 + and logprobs.top_logprobs[0] is None) + for top_logprobs in logprobs.top_logprobs[1:]: + assert max(logprobs_arg, + 1) <= len(top_logprobs) <= logprobs_arg + 1 + assert len(logprobs.tokens) > 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, + guided_decoding_backend: str, + sample_json_schema, sample_regex): + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example JSON that fits this schema: 42", + extra_body=dict(guided_json=42, + guided_decoding_backend=guided_decoding_backend)) + + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example string that fits this regex", + extra_body=dict(guided_regex=sample_regex, + guided_json=sample_json_schema)) \ No newline at end of file From f1a689c2d0b4a90ff96216fce5eb0cae44262fa2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 12 Dec 2024 11:34:53 +0000 Subject: [PATCH 158/293] toward completion tests Signed-off-by: Andrew Feldman --- .../v1/entrypoints/openai/test_completion.py | 291 ++---------------- 1 file changed, 18 insertions(+), 273 deletions(-) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 20255d6b33b06..1a3d458b118ab 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,63 +1,21 @@ # imports for guided decoding tests -import json import re -import shutil -from tempfile import TemporaryDirectory from typing import Dict, List, Optional -import jsonschema import openai # use the official client for correctness check import pytest import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download from openai import BadRequestError -from transformers import AutoTokenizer +from tests.utils import RemoteOpenAIServer from vllm.transformers_utils.tokenizer import get_tokenizer -from ...utils import RemoteOpenAIServer - # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically these adapters use a different base model, -# but we're not testing generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" -PA_NAME = "swapnilbp/llama_tweet_ptune" -# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also -# need to change to match the prompt adapter -PA_NUM_VIRTUAL_TOKENS = 8 - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def zephyr_lora_added_tokens_files(zephyr_lora_files): - tmp_dir = TemporaryDirectory() - tmp_model_dir = f"{tmp_dir.name}/zephyr" - shutil.copytree(zephyr_lora_files, tmp_model_dir) - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - # Copy tokenizer to adapter and add some unique tokens - # 32000, 32001, 32002 - added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], - special_tokens=True) - assert added == 3 - 
tokenizer.save_pretrained(tmp_model_dir) - yield tmp_model_dir - tmp_dir.cleanup() - - -@pytest.fixture(scope="module") -def zephyr_pa_files(): - return snapshot_download(repo_id=PA_NAME) @pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, - zephyr_pa_files): +def default_server_args(): return [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -67,24 +25,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, "--max-num-seqs", "128", "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_added_tokens_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - # pa config - "--enable-prompt-adapter", - "--prompt-adapters", - f"zephyr-pa={zephyr_pa_files}", - f"zephyr-pa2={zephyr_pa_files}", - "--max-prompt-adapters", - "2", - "--max-prompt-adapter-token", - "128", ] @@ -105,14 +45,11 @@ async def client(server): @pytest.mark.asyncio @pytest.mark.parametrize( - # first test base model, then test loras, then test prompt adapters - "model_name,num_virtual_tokens", - [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0), - ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS), - ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)], + "model_name", + [MODEL_NAME], ) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, - num_virtual_tokens: int): +async def test_single_completion(client: openai.AsyncOpenAI, + model_name: str) -> None: completion = await client.completions.create(model=model_name, prompt="Hello, my name is", max_tokens=5, @@ -125,9 +62,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, assert len(choice.text) >= 5 assert choice.finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, - prompt_tokens=6 + num_virtual_tokens, - total_tokens=11 + num_virtual_tokens) + completion_tokens=5, prompt_tokens=6, total_tokens=11) # test using token IDs completion = await client.completions.create( @@ -140,39 +75,10 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, assert completion.choices[0].prompt_logprobs is None -@pytest.mark.asyncio -async def test_added_lora_tokens(client: openai.AsyncOpenAI): - # test using token IDs - completion = await client.completions.create( - model="zephyr-lora2", - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - # Added tokens should appear in tokenized prompt - assert completion.choices[0].text.startswith("vllm1vllm2vllm3") - - -@pytest.mark.asyncio -async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): - # test using token IDs - with pytest.raises(openai.BadRequestError, match="out of vocabulary"): - # Added tokens should be rejected by the base model - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - - @pytest.mark.asyncio @pytest.mark.parametrize( - # first test base model, then test loras, then test prompt adapters "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"], + [MODEL_NAME], ) async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -189,9 +95,8 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( - # just test 1 lora and 1 pa hereafter 
"model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -212,7 +117,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -233,10 +138,10 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): + model_name: str) -> None: with pytest.raises( (openai.BadRequestError, openai.APIError)): # test using token IDs @@ -309,10 +214,10 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): + model_name: str) -> None: prompt = "What is an LLM?" single_completion = await client.completions.create( @@ -343,7 +248,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): """Streaming for parallel sampling. @@ -377,7 +282,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_completion_stream_options(client: openai.AsyncOpenAI, model_name: str): @@ -514,7 +419,7 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME], ) async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): # test both text and token IDs @@ -565,53 +470,6 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): assert texts[0] == texts[1] -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, 
- max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, - ) - assert first_response != completion.choices[0].text - - @pytest.mark.asyncio async def test_allowed_token_ids(client: openai.AsyncOpenAI): prompt = "Hello, my name is" @@ -634,102 +492,10 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_json_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema): - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_json=sample_json_schema, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_regex_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_regex): - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {sample_regex}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(guided_regex=sample_regex, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(sample_regex, - completion.choices[i].text) is not None - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_choice_completion(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_guided_choice): - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(guided_choice=sample_guided_choice, - guided_decoding_backend=guided_decoding_backend)) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in sample_guided_choice - - -@pytest.mark.asyncio -async def test_guided_grammar(client: openai.AsyncOpenAI, - sample_sql_statements): - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), - temperature=1.0, - max_tokens=500, - extra_body=dict(guided_grammar=sample_sql_statements)) - - content = completion.choices[0].text - - # use Lark to parse the output, and make sure it's a valid parse tree - from lark import Lark - parser = Lark(sample_sql_statements) - parser.parse(content) - - # remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") - - assert content.strip() == ground_truth - - @pytest.mark.asyncio @pytest.mark.parametrize( - # first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], + [MODEL_NAME], ) 
@pytest.mark.parametrize("logprobs_arg", [1, 0]) async def test_echo_logprob_completion(client: openai.AsyncOpenAI, @@ -758,24 +524,3 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1 assert len(logprobs.tokens) > 5 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) -async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, - guided_decoding_backend: str, - sample_json_schema, sample_regex): - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(guided_json=42, - guided_decoding_backend=guided_decoding_backend)) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(guided_regex=sample_regex, - guided_json=sample_json_schema)) \ No newline at end of file From e962aa7e4d74f4e42a5464ba82f2ac41156e803d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 12 Dec 2024 17:51:40 +0000 Subject: [PATCH 159/293] serialization fix Signed-off-by: Andrew Feldman --- vllm/v1/engine/core.py | 4 ++-- vllm/v1/engine/core_client.py | 5 +++-- vllm/v1/serial_utils.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5fc4f2e425726..bf07dc94bb8f7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,7 +23,7 @@ from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.serial_utils import PickleEncoder, custom_enc_hook from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -517,7 +517,7 @@ def process_output_socket(self, output_path: str): """Output socket IO thread.""" # Msgpack serialization encoding. - encoder = msgpack.Encoder() + encoder = msgpack.Encoder(enc_hook=custom_enc_hook) # Reuse send buffer. buffer = bytearray() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 835963f7ee86c..236d633e8d5da 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,7 +12,7 @@ EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType) from vllm.v1.engine.core import EngineCore, EngineCoreProc -from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.serial_utils import PickleEncoder, custom_ext_hook logger = init_logger(__name__) @@ -124,7 +124,8 @@ def __init__( ): # Serialization setup. self.encoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) + self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs, + ext_hook=custom_ext_hook) # ZMQ setup. 
self.ctx = (zmq.asyncio.Context() if asyncio_mode else zmq.Context()) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index b1cd5c11834f8..76f7076cfa9e0 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,4 +1,11 @@ import pickle +from typing import Any + +import numpy as np +from msgspec import msgpack + +CUSTOM_TYPE_CODE_PICKLE = 1 +pickle_types = (np.ndarray, ) class PickleEncoder: @@ -8,3 +15,24 @@ def encode(self, obj): def decode(self, data): return pickle.loads(data) + + +def custom_enc_hook(obj: Any) -> Any: + if isinstance(obj, pickle_types): + # Return an `Ext` object so msgspec serializes it as an extension type. + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj)) + else: + # Raise a NotImplementedError for other types + raise NotImplementedError( + f"Objects of type {type(obj)} are not supported") + + +def custom_ext_hook(code: int, data: memoryview) -> Any: + if code == CUSTOM_TYPE_CODE_PICKLE: + # This extension type represents a complex number, decode the data + # buffer accordingly. + return pickle.loads(data) + else: + # Raise a NotImplementedError for other extension type codes + raise NotImplementedError( + f"Extension type code {code} is not supported") From b22c5e79701bbe547e65ccf894579071815846a2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 16 Dec 2024 21:15:45 +0000 Subject: [PATCH 160/293] formatted vllm/v1/engine/core.py Signed-off-by: Andrew Feldman --- vllm/v1/engine/core.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a2d64bc7e3e5b..46a744940b940 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -188,6 +188,13 @@ def update_from_output( # # Note: new_prompt_logprobs will be used later to build the # engine core output + assert (model_runner_output.batch_prompt_logprobs_cpu + is not None) + assert ( + model_runner_output.batch_prompt_logprob_token_ids_cpu + is not None) + assert request.prompt_logprobs is not None + assert request.prompt_logprob_token_ids is not None logprob_cnt = request_prompt_logprobs mr_output_slice_upper_index = ( mr_output_slice_lower_index + num_new_prompt_tokens) @@ -234,6 +241,9 @@ def update_from_output( # generates at most one token at each step. token_id = sampled_token_ids[req_index] if request_do_logprobs: + assert model_runner_output.batch_logprobs_cpu is not None + assert model_runner_output.batch_logprob_token_ids_cpu is not None + assert request.logprobs is not None # Slice out this request's sample logprobs; defer # pythonization to be carried out in the frontend. request.logprobs.append( @@ -247,6 +257,14 @@ def update_from_output( # This must be called before me make the EngineCoreOutput. stopped = scheduler._check_stop(request) + # Compute engine core output logprobs list as such, + # so the type checker can see the assert + if request_do_logprobs: + assert request.logprobs is not None + logprobs = request.logprobs[-num_new_tokens:] + else: + logprobs = None + # Add EngineCoreOutput for this Request. # Return the logprob for the most recently computed tokens. # Return no prompt logprobs in decode-phase. 
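The custom msgpack hooks added in vllm/v1/serial_utils.py above can be exercised on their own. A minimal round-trip sketch, assuming numpy and msgspec are installed (illustrative only, not part of the patch series):

import numpy as np
from msgspec import msgpack

from vllm.v1.serial_utils import custom_enc_hook, custom_ext_hook

encoder = msgpack.Encoder(enc_hook=custom_enc_hook)
decoder = msgpack.Decoder(ext_hook=custom_ext_hook)

# np.ndarray is not natively msgpack-serializable, so custom_enc_hook wraps
# it in an Ext(code=1) payload of pickled bytes; custom_ext_hook unpickles it
# on the receiving side.
arr = np.arange(6, dtype=np.float32).reshape(2, 3)
buf = encoder.encode({"logprobs": arr})
restored = decoder.decode(buf)

assert np.array_equal(restored["logprobs"], arr)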
@@ -256,8 +274,7 @@ def update_from_output( finished=request.is_finished(), finish_reason=request.get_finished_reason(), stop_reason=request.stop_reason, - logprobs=(request.logprobs[-num_new_tokens:] - if request_do_logprobs else None), + logprobs=logprobs, prompt_logprobs=(new_prompt_logprobs if request_do_prompt_logprobs else None), prompt_logprobs_token_ids=(new_prompt_logprob_token_ids From 5bc70399dc006b3d7d8e5685896fad6fd48b167c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 16 Dec 2024 21:58:32 +0000 Subject: [PATCH 161/293] wip merge Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 5 ++++ vllm/v1/worker/gpu_model_runner.py | 41 ++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index ec5dd461ece6c..dea64c4f9b134 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -196,6 +196,7 @@ def _compute_logprobs_from_processed_logits( # Batch requires only prompt logprobs # - Compute top logprobs only at sequence offsets of prompt tokens + assert logits_w_tmp_tpk_tpp is not None logprobs = self.get_logprobs( logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) @@ -203,6 +204,9 @@ def _compute_logprobs_from_processed_logits( return ((None, None) + self._top_logprobs_token_indices( logprobs, sampling_metadata.max_num_batch_prompt_logprobs)) + raise ValueError("One or both of Logprobs and Prompt Logprobs must" + " be enabled to use this method.") + def forward( self, logits: torch.Tensor, @@ -242,6 +246,7 @@ def forward( # request in the batch. While we should not sample any token from this # partial request, we do so for simplicity. We will ignore the sampled # token from the partial request. + assert sampling_metadata.query_start_loc is not None maybe_sample_logits_indices = sampling_metadata.query_start_loc[1:] - 1 prompt_logits_mask = torch.ones(sampling_metadata.num_input_tokens, dtype=torch.bool) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7ba1efd0d8280..963ab254e5ea3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -558,24 +558,43 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # Prepare batch-level sample logprobs in a way that the type-checker + # understands + if do_batch_sample_logprobs: + assert (sampler_output.batch_sample_logprob_token_ids is not None) + assert (sampler_output.batch_sample_logprobs is not None) + batch_logprob_token_ids_cpu = ( + sampler_output.batch_sample_logprob_token_ids.cpu().numpy()) + batch_logprobs_cpu = ( + sampler_output.batch_sample_logprobs.cpu().numpy()) + else: + batch_logprob_token_ids_cpu = None + batch_logprobs_cpu = None + + # Prepare batch-level prompt logprobs in a way that the type-checker + # understands + if do_batch_prompt_logprobs: + assert (sampler_output.batch_prompt_logprob_token_ids is not None) + assert (sampler_output.batch_prompt_logprobs is not None) + batch_prompt_logprob_token_ids_cpu = sampler_output.batch_prompt_logprob_token_ids.cpu( + ).numpy() + batch_prompt_logprobs_cpu = sampler_output.batch_prompt_logprobs.cpu( + ).numpy() + else: + batch_prompt_logprob_token_ids_cpu = None + batch_prompt_logprobs_cpu = None + model_runner_output = ModelRunnerOutput( req_ids=self.input_batch.req_ids[:num_reqs], req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids_cpu=sampled_token_ids, # NOTE: sample and prompt logprob 
CPU-GPU synchronization happens # here - batch_logprob_token_ids_cpu=( - sampler_output.batch_sample_logprob_token_ids.cpu().numpy() - if do_batch_sample_logprobs else None), - batch_logprobs_cpu=( - sampler_output.batch_sample_logprobs.cpu().numpy() - if do_batch_sample_logprobs else None), + batch_logprob_token_ids_cpu=batch_logprob_token_ids_cpu, + batch_logprobs_cpu=batch_logprobs_cpu, batch_prompt_logprob_token_ids_cpu=( - sampler_output.batch_prompt_logprob_token_ids.cpu().numpy() - if do_batch_prompt_logprobs else None), - batch_prompt_logprobs_cpu=( - sampler_output.batch_prompt_logprobs.cpu().numpy() - if do_batch_prompt_logprobs else None)) + batch_prompt_logprob_token_ids_cpu), + batch_prompt_logprobs_cpu=(batch_prompt_logprobs_cpu)) return model_runner_output def load_model(self) -> None: From 4d53751c8f5e0d6a83869ab964e42d3caec3eff4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 16 Dec 2024 22:44:46 +0000 Subject: [PATCH 162/293] formatting Signed-off-by: Andrew Feldman --- vllm/v1/engine/core.py | 3 ++- vllm/v1/engine/detokenizer.py | 29 +++++++++++++++++++---------- vllm/v1/worker/gpu_model_runner.py | 14 +++++++------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 46a744940b940..a599b2552e541 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -242,7 +242,8 @@ def update_from_output( token_id = sampled_token_ids[req_index] if request_do_logprobs: assert model_runner_output.batch_logprobs_cpu is not None - assert model_runner_output.batch_logprob_token_ids_cpu is not None + assert (model_runner_output.batch_logprob_token_ids_cpu + is not None) assert request.logprobs is not None # Slice out this request's sample logprobs; defer # pythonization to be carried out in the frontend. diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2d1ce9a4056a3..1b3c8e671909c 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Iterable, List, Optional, Tuple, Union, cast import numpy as np import numpy.typing as npt @@ -170,7 +170,8 @@ def _pythonize_sequence_position( """ logprob_values = logprob_values.tolist() logprob_token_ids = logprob_token_ids.tolist() - logprob_token_strs = (self._detokenize_ids(logprob_token_ids) if + logprob_token_strs = (cast(List[Optional[str]], + self._detokenize_ids(logprob_token_ids)) if detokenize else [None] * len(logprob_token_ids)) return { @@ -213,6 +214,8 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( """ new_pythonized_logprobs = [] max_logprobs = self.max_request_sample_logprobs + assert max_logprobs is not None + assert self.request_logprobs is not None for (logprob_values, logprob_token_ids), token_id in zip(new_sample_logprobs, new_sample_token_ids): @@ -246,8 +249,8 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( def _pythonize_maybe_detokenize_prompt_logprobs_for_request( self, - prompt_logprob_values: Optional[npt.NDArray], - prompt_logprob_token_ids: Optional[npt.NDArray], + prompt_logprob_values: npt.NDArray, + prompt_logprob_token_ids: npt.NDArray, detokenize: bool, ) -> PromptLogprobs: """Pythonize prompt logprobs, maybe detokenize. 
@@ -278,7 +281,7 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( detokenized """ logprob_cnt = self.max_request_prompt_logprobs - prompt_logprobs = [ + prompt_logprobs: List[Optional[Dict[int, Logprob]]] = [ self._pythonize_sequence_position(plp_tok_values, plp_tok_token_ids, detokenize) for plp_tok_values, plp_tok_token_ids in zip( @@ -288,7 +291,9 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( ] if not self.request_prompt_logprobs: # Ensure that None is the first prompt logprob - prompt_logprobs = [None] + prompt_logprobs + prompt_logprobs = cast(List[Optional[Dict[int, Logprob]]], + [None]) + prompt_logprobs + assert self.request_prompt_logprobs is not None self.request_prompt_logprobs.extend(prompt_logprobs) return prompt_logprobs @@ -330,16 +335,18 @@ def add_tokens( # Only try to Pythonize sample logprobs if any were provided do_request_sample_logprobs = new_sample_logprobs is not None and len( new_sample_logprobs) > 0 - assert not do_request_sample_logprobs or len( - new_sample_logprobs) == len(new_sampled_token_ids) + if do_request_sample_logprobs: + assert new_sample_logprobs is not None + assert len(new_sample_logprobs) == len(new_sampled_token_ids) # Only try to Pythonize prompt logprobs if any were provided do_request_prompt_logprobs = new_prompt_logprobs is not None and len( new_prompt_logprobs) > 0 - assert (not do_request_prompt_logprobs - or new_prompt_logprob_token_ids is not None) + if do_request_prompt_logprobs: + assert new_prompt_logprob_token_ids is not None if do_request_sample_logprobs: # 1) Pythonize & detokenize sample logprobs + assert new_sample_logprobs is not None new_sample_logprobs = ( self._pythonize_maybe_detokenize_sample_logprobs_for_request( new_sample_logprobs, @@ -348,6 +355,8 @@ def add_tokens( if do_request_prompt_logprobs: # 2) If necessary, detokenize prompt logprobs incrementally + assert new_prompt_logprobs is not None + assert new_prompt_logprob_token_ids is not None new_prompt_logprobs = ( self._pythonize_maybe_detokenize_prompt_logprobs_for_request( new_prompt_logprobs, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 963ab254e5ea3..f0587ca44c3c5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -251,7 +251,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> Tuple[torch.Tensor, FlashAttentionMetadata]: + ) -> FlashAttentionMetadata: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -576,18 +576,18 @@ def execute_model( if do_batch_prompt_logprobs: assert (sampler_output.batch_prompt_logprob_token_ids is not None) assert (sampler_output.batch_prompt_logprobs is not None) - batch_prompt_logprob_token_ids_cpu = sampler_output.batch_prompt_logprob_token_ids.cpu( - ).numpy() - batch_prompt_logprobs_cpu = sampler_output.batch_prompt_logprobs.cpu( - ).numpy() + batch_prompt_logprob_token_ids_cpu = ( + sampler_output.batch_prompt_logprob_token_ids.cpu().numpy()) + batch_prompt_logprobs_cpu = ( + sampler_output.batch_prompt_logprobs.cpu().numpy()) else: batch_prompt_logprob_token_ids_cpu = None batch_prompt_logprobs_cpu = None model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids[:num_reqs], + req_ids=cast(List[str], self.input_batch.req_ids[:num_reqs]), req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids_cpu=sampled_token_ids, + 
sampled_token_ids=sampled_token_ids, # NOTE: sample and prompt logprob CPU-GPU synchronization happens # here batch_logprob_token_ids_cpu=batch_logprob_token_ids_cpu, From 697fc153935758ac68839a7d45116edccc01ccc2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 17 Dec 2024 06:59:26 +0000 Subject: [PATCH 163/293] cleanup Signed-off-by: Andrew Feldman --- vllm/model_executor/sampling_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index a58589bb915ed..1df8f84ed4093 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -426,7 +426,7 @@ def from_sampling_metadata( do_penalties = True is_prompt = seq_group.is_prompt - if (is_prompt and sampling_params.prompt_logprobs is not None): + if is_prompt and sampling_params.prompt_logprobs is not None: # For tokens in the prompt that we only need to get # their logprobs query_len = seq_group.query_len From f61d8223cce33c76b7d6132ad09d535612caa027 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 17 Dec 2024 07:04:38 +0000 Subject: [PATCH 164/293] remove calling max_logprobs from engine Signed-off-by: Andrew Feldman --- vllm/v1/engine/async_llm.py | 2 -- vllm/v1/engine/llm_engine.py | 2 -- vllm/v1/engine/processor.py | 3 +-- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d744bf87155ff..497b3d79b637d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -163,8 +163,6 @@ async def add_request( request_id=request_id, prompt=prompt, params=params, - max_logprobs_permitted_by_engine=( - await self.get_model_config()).max_logprobs, arrival_time=arrival_time, lora_request=lora_request, trace_headers=trace_headers, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 89f6ee2ce648b..fa835cbb28324 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -150,8 +150,6 @@ def add_request( request_id=request_id, prompt=prompt, params=params, - max_logprobs_permitted_by_engine=self.get_model_config( - ).max_logprobs, arrival_time=arrival_time, lora_request=lora_request, trace_headers=trace_headers, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index aec80163ffa92..9b4dfe39f5f47 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -78,7 +78,6 @@ def process_inputs( request_id: str, prompt: PromptType, params: Union[SamplingParams, PoolingParams], - max_logprobs_permitted_by_engine: int, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, @@ -109,7 +108,7 @@ def process_inputs( # TODO(woosuk): Support encoder-decoder models. 
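The validation helper invoked just below is not shown in this excerpt; a rough sketch of the kind of check it performs, with a hypothetical body and a free-function signature (the real code is a Processor method):

from vllm import SamplingParams


def _assert_valid_sample_logprobs_prompt_logprobs(params,
                                                  max_logprobs: int) -> None:
    # Hypothetical body: reject requests that ask for more sample or prompt
    # logprobs than the engine's configured maximum.
    if isinstance(params, SamplingParams):
        for name in ("logprobs", "prompt_logprobs"):
            requested = getattr(params, name, None)
            if requested is not None and requested > max_logprobs:
                raise ValueError(
                    f"Requested {name}={requested}, which exceeds "
                    f"max_logprobs={max_logprobs}.")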
self._assert_valid_sample_logprobs_prompt_logprobs( - params, max_logprobs_permitted_by_engine) + params, self.model_config.max_logprobs) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " From b77c1aff0e48e144f8a6635c6bf2689193988cc5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 17 Dec 2024 07:07:16 +0000 Subject: [PATCH 165/293] remove change in hpu Signed-off-by: Andrew Feldman --- vllm/worker/hpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d3d6ac0c613aa..9d479f412af46 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1158,8 +1158,8 @@ def prepare_input_tensors( paddings = list(itertools.accumulate(paddings)) paddings_prompt_logprobs = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): - if (seq_group_metadata.sampling_params.prompt_logprobs is not None - and seq_group_metadata.is_prompt): + if seq_group_metadata.sampling_params.prompt_logprobs is not None \ + and seq_group_metadata.is_prompt: paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) paddings = torch.tensor( paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, From f0c1ba7d3d3ff28eb6e2a2102af28ebeba28b77a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 18 Dec 2024 06:59:24 +0000 Subject: [PATCH 166/293] deferring v1 test_completion.py to later PR Signed-off-by: Andrew Feldman --- .../v1/entrypoints/openai/test_completion.py | 526 ------------------ 1 file changed, 526 deletions(-) delete mode 100644 tests/v1/entrypoints/openai/test_completion.py diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py deleted file mode 100644 index 1a3d458b118ab..0000000000000 --- a/tests/v1/entrypoints/openai/test_completion.py +++ /dev/null @@ -1,526 +0,0 @@ -# imports for guided decoding tests -import re -from typing import Dict, List, Optional - -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -from openai import BadRequestError - -from tests.utils import RemoteOpenAIServer -from vllm.transformers_utils.tokenizer import get_tokenizer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" - - -@pytest.fixture(scope="module") -def default_server_args(): - return [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - ] - - -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) -def server(default_server_args, request): - if request.param: - default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: - yield remote_server - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_single_completion(client: openai.AsyncOpenAI, - model_name: str) -> None: - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert 
len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str) -> None: - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... 
- - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), - (MODEL_NAME, 0), - (MODEL_NAME, 1), - (MODEL_NAME, None)]) -async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): - params: Dict = { - "prompt": ["A robot may not injure another robot", "My name is"], - "model": model_name, - } - if prompt_logprobs is not None: - params["extra_body"] = {"prompt_logprobs": prompt_logprobs} - - if prompt_logprobs is not None and prompt_logprobs < 0: - with pytest.raises(BadRequestError): - await client.completions.create(**params) - else: - completion = await client.completions.create(**params) - if prompt_logprobs is not None: - assert completion.choices[0].prompt_logprobs is not None - assert len(completion.choices[0].prompt_logprobs) > 0 - - assert completion.choices[1].prompt_logprobs is not None - assert len(completion.choices[1].prompt_logprobs) > 0 - - else: - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str) -> None: - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: List[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): - """Streaming for parallel sampling. - The tokens from multiple samples, are flattened into a single stream, - with an index to indicate which sample the token belongs to. - """ - - prompt = "What is an LLM?" - n = 3 - max_tokens = 5 - - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=max_tokens, - n=n, - stream=True) - chunks: List[List[str]] = [[] for i in range(n)] - finish_reason_count = 0 - async for chunk in stream: - index = chunk.choices[0].index - text = chunk.choices[0].text - chunks[index].append(text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - assert finish_reason_count == n - for chunk in chunks: - assert len(chunk) == max_tokens - print("".join(chunk)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" 
- - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - False, - }) - - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - False, - }) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is not None - assert chunk.usage.prompt_tokens > 0 - assert chunk.usage.completion_tokens > 0 - assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) - if chunk.choices[0].finish_reason is not None: - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options= - # {"include_usage": None} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options= - # {"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": None} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": None}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": True} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - 
stream_options={"continuous_usage_stats": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but - # not necessary for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_allowed_token_ids(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 1 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - allowed_ids = [21555, 21557, 21558] - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - seed=42, - extra_body=dict(allowed_token_ids=allowed_ids), - logprobs=1, - ) - response_tokens = completion.choices[0].logprobs.tokens - assert len(response_tokens) == 1 - assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME], -) -@pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - model_name: str, logprobs_arg: int): - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - # test using text and token IDs - for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): - completion = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - echo=True, - logprobs=logprobs_arg) - - prompt_text = tokenizer.decode(prompt) if isinstance(prompt, - list) else prompt - assert re.search(r"^" + prompt_text, completion.choices[0].text) - logprobs = completion.choices[0].logprobs - assert logprobs is not None - assert len(logprobs.text_offset) > 5 - assert (len(logprobs.token_logprobs) > 5 - and logprobs.token_logprobs[0] is None) - assert (len(logprobs.top_logprobs) > 5 - and logprobs.top_logprobs[0] is None) - for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 - assert len(logprobs.tokens) > 5 From 15654c4ac00b60b6c4004ff34d65a362ae10c687 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 29 Dec 2024 11:07:06 -0500 Subject: [PATCH 167/293] simplify changes to scheduler --- vllm/v1/core/scheduler.py | 28 ++++++++++------------------ 1 file changed, 10 
insertions(+), 18 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 9471eee4c331c..5f85d66249542 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -107,11 +107,11 @@ def schedule(self) -> "SchedulerOutput": # but not all. The constraint is due to the persistent batch in the # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. - has_partial_request = False + partial_req_index = None req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". - assert not has_partial_request + assert partial_req_index is None assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens @@ -158,9 +158,10 @@ def schedule(self) -> "SchedulerOutput": ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens + if (request.num_computed_tokens + num_new_tokens < + request.num_tokens): + partial_req_index = req_index req_index += 1 - has_partial_request = (request.num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -174,7 +175,7 @@ def schedule(self) -> "SchedulerOutput": # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting: - if has_partial_request: + if partial_req_index: break if len(self.running) == self.max_num_running_reqs: break @@ -240,8 +241,9 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + if (num_computed_tokens + num_new_tokens < request.num_tokens): + assert partial_req_index is None + partial_req_index = req_index # Encoder-related. 
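The budget handling above reduces to a simple clamp applied per request; a minimal sketch of that step, not taken from the patch:

from typing import Tuple


def take_budget(num_remaining: int,
                token_budget: int) -> Tuple[int, int, bool]:
    # Clamp a request's remaining tokens to the step's token budget.
    # Returns (num_new_tokens, remaining_budget, is_partial); a request whose
    # remaining tokens did not all fit stays "partial" for this step.
    num_new_tokens = min(num_remaining, token_budget)
    return (num_new_tokens, token_budget - num_new_tokens,
            num_new_tokens < num_remaining)


# e.g. a 300-token prompt against a 128-token budget is chunked:
assert take_budget(300, 128) == (128, 0, True)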
if encoder_inputs_to_schedule: @@ -279,16 +281,6 @@ def schedule(self) -> "SchedulerOutput": ] preempted_req_ids = {req.request_id for req in preempted_reqs} - partial_req_indices = [ - idx for idx, request in enumerate(self.running) - if request.num_computed_tokens + - num_scheduled_tokens[request.request_id] < request.num_tokens - ] - num_partial_reqs = len(partial_req_indices) - assert num_partial_reqs < 2 - partial_req_index = (partial_req_indices[0] - if num_partial_reqs > 0 else -1) - scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, @@ -542,7 +534,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] - partial_req_index: int # >0 if running req is partial, -1 o/w + partial_req_index: Optional[int] num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int From 5857f87363fa3c8fa5c8f7e93573c8086527833b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 29 Dec 2024 11:09:42 -0500 Subject: [PATCH 168/293] small assert --- vllm/v1/core/scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 5f85d66249542..b3f4313e7a5c1 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -160,6 +160,7 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens if (request.num_computed_tokens + num_new_tokens < request.num_tokens): + assert partial_req_index is None partial_req_index = req_index req_index += 1 From a49d4c16f681780c171a8558d5ca0e2738c040f3 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 29 Dec 2024 11:10:12 -0500 Subject: [PATCH 169/293] nit --- vllm/v1/core/scheduler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index b3f4313e7a5c1..6e9b1396c63af 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -281,7 +281,6 @@ def schedule(self) -> "SchedulerOutput": req.num_computed_tokens) for req in scheduled_running_reqs ] preempted_req_ids = {req.request_id for req in preempted_reqs} - scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, From dc7d27cd50a56d6143f0997819eab79a048304b3 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 29 Dec 2024 11:17:52 -0500 Subject: [PATCH 170/293] revert moving update from output file --- vllm/v1/core/scheduler.py | 191 +++++++++++++++++++++++++++++++++++++ vllm/v1/engine/core.py | 193 +------------------------------------- 2 files changed, 193 insertions(+), 191 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 6e9b1396c63af..ea6ded4be9be0 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -10,6 +10,8 @@ from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.engine import EngineCoreOutput +from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus if TYPE_CHECKING: @@ -385,6 +387,195 @@ def _try_schedule_encoder_inputs( encoder_inputs_to_schedule.append(i) return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[EngineCoreOutput]: + """Build engine core output from 
model runner output. + + Args: + scheduler_output: scheduler output prior to engine step. + model_runner_output: model runner output from engine step. + + Returns: + Engine core output which tracks the progress of generation. + """ + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + do_batch_sample_logprobs = (model_runner_output.batch_logprobs_cpu + is not None) + do_batch_prompt_logprobs = ( + model_runner_output.batch_prompt_logprobs_cpu is not None + and len(model_runner_output.batch_prompt_logprobs_cpu) > 0) + + if do_batch_prompt_logprobs: + # Index into prompt tokens, for building + # prompt logprobs output data structure + mr_output_slice_lower_index = 0 + new_running: List[Request] = [] + engine_core_outputs: List[EngineCoreOutput] = [] + for request in self.running: + req_id = request.request_id + prev_num_computed_tokens = request.num_computed_tokens + request.num_computed_tokens += num_scheduled_tokens[req_id] + req_index = model_runner_output.req_id_to_index[req_id] + num_new_tokens = 1 + request_sample_logprobs = request.request_sample_logprobs + request_do_logprobs = (do_batch_sample_logprobs + and request_sample_logprobs is not None + and request_sample_logprobs > 0) + + if do_batch_prompt_logprobs: + request_prompt_logprobs = request.request_prompt_logprobs + # Number of new prompt tokens is the number of scheduled + # tokens *if* the request is partial (because the sampled + # token is discarded and all sequence offsets are prompt + # offsets), otherwise it is the number of scheduled + # tokens minus one (for the sampled token) + req_is_not_partial = (scheduler_output.partial_req_index != + req_index) + num_new_prompt_tokens = ( + num_scheduled_tokens[request.request_id] - + int(req_is_not_partial)) + + request_do_prompt_logprobs = (request_prompt_logprobs + is not None + and request_prompt_logprobs > 0 + and num_new_prompt_tokens > 0) + + if request_do_prompt_logprobs: + # Construct prompt logprobs, under the condition that + # prompt logprobs were requested & a nonzero number of + # prompt tokens were computed in this step for this request. + # + # Pythonization is deferred to outside the engine core. + # + # Note that this scenario returns an EngineCoreOutput which + # is empty except for the prompt logprobs which were + # computed for these prompt tokens. 
+ # + # Note: new_prompt_logprobs will be used later to build the + # engine core output + assert (model_runner_output.batch_prompt_logprobs_cpu + is not None) + assert ( + model_runner_output.batch_prompt_logprob_token_ids_cpu + is not None) + assert request.prompt_logprobs is not None + assert request.prompt_logprob_token_ids is not None + logprob_cnt = request_prompt_logprobs + mr_output_slice_upper_index = ( + mr_output_slice_lower_index + num_new_prompt_tokens) + new_prompt_logprobs = ( + model_runner_output.batch_prompt_logprobs_cpu[ + mr_output_slice_lower_index: + mr_output_slice_upper_index, 0:logprob_cnt]) + new_prompt_logprob_token_ids = ( + model_runner_output.batch_prompt_logprob_token_ids_cpu[ + mr_output_slice_lower_index: + mr_output_slice_upper_index, 0:logprob_cnt]) + + req_slice_upper_index = (prev_num_computed_tokens + + num_new_prompt_tokens) + request.prompt_logprobs[ + prev_num_computed_tokens: + req_slice_upper_index] = new_prompt_logprobs + request.prompt_logprob_token_ids[ + prev_num_computed_tokens: + req_slice_upper_index] = new_prompt_logprob_token_ids + mr_output_slice_lower_index = mr_output_slice_upper_index + else: + mr_output_slice_lower_index += num_new_prompt_tokens + else: + request_do_prompt_logprobs = False + + # When the request's num_computed_tokens catches up its num_tokens, + # the request generates output tokens. Otherwise, we ignore the + # sampler output for the request. + assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + self.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + self.encoder_cache_manager.free(request, input_id) + + if request.num_computed_tokens == request.num_tokens: + # NOTE(woosuk): Currently, we assume that each request + # generates at most one token at each step. + token_id = sampled_token_ids[req_index] + if request_do_logprobs: + assert model_runner_output.batch_logprobs_cpu is not None + assert (model_runner_output.batch_logprob_token_ids_cpu + is not None) + assert request.logprobs is not None + # Slice out this request's sample logprobs; defer + # pythonization to be carried out in the frontend. + request.logprobs.append( + (model_runner_output.batch_logprobs_cpu[req_index], + model_runner_output. + batch_logprob_token_ids_cpu[req_index])) + request.append_output_token_ids(token_id) + # TODO: Update the KV cache manager for prefix caching. + + # Check for stop and update request state. + # This must be called before me make the EngineCoreOutput. + stopped = self._check_stop(request) + + # Compute engine core output logprobs list as such, + # so the type checker can see the assert + if request_do_logprobs: + assert request.logprobs is not None + logprobs = request.logprobs[-num_new_tokens:] + else: + logprobs = None + + # Add EngineCoreOutput for this Request. + # Return the logprob for the most recently computed tokens. + # Return no prompt logprobs in decode-phase. 
+ output = EngineCoreOutput( + request_id=req_id, + new_token_ids=request.output_token_ids[-num_new_tokens:], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=logprobs, + prompt_logprobs=(new_prompt_logprobs + if request_do_prompt_logprobs else None), + prompt_logprobs_token_ids=(new_prompt_logprob_token_ids + if request_do_prompt_logprobs + else None)) + engine_core_outputs.append(output) + + # Breakout of the loop. + if stopped: + continue + + elif request_do_prompt_logprobs: + # This request is still partial but prompt logprobs were + # requested + engine_core_outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs=[] if request_do_logprobs else None, + prompt_logprobs=new_prompt_logprobs, + prompt_logprobs_token_ids=new_prompt_logprob_token_ids) + ) + + new_running.append(request) + self.running = new_running + return engine_core_outputs + def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len or request.num_output_tokens >= request.max_tokens): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8f02d20fc4744..e658b744caf27 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -116,196 +116,6 @@ def abort_requests(self, request_ids: List[str]): self.scheduler.finish_requests(request_ids, RequestStatus.FINISHED_ABORTED) - def update_from_output( - self, - scheduler_output: "SchedulerOutput", - model_runner_output: "ModelRunnerOutput", - ) -> List[EngineCoreOutput]: - """Build engine core output from model runner output. - - Args: - scheduler_output: scheduler output prior to engine step. - model_runner_output: model runner output from engine step. - - Returns: - Engine core output which tracks the progress of generation. - """ - scheduler = self.scheduler - # NOTE(woosuk): This method doesn't consider speculative decoding. 
- sampled_token_ids = model_runner_output.sampled_token_ids - num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_batch_sample_logprobs = (model_runner_output.batch_logprobs_cpu - is not None) - do_batch_prompt_logprobs = ( - model_runner_output.batch_prompt_logprobs_cpu is not None - and len(model_runner_output.batch_prompt_logprobs_cpu) > 0) - - if do_batch_prompt_logprobs: - # Index into prompt tokens, for building - # prompt logprobs output data structure - mr_output_slice_lower_index = 0 - new_running: List[Request] = [] - engine_core_outputs: List[EngineCoreOutput] = [] - for request in scheduler.running: - req_id = request.request_id - prev_num_computed_tokens = request.num_computed_tokens - request.num_computed_tokens += num_scheduled_tokens[req_id] - req_index = model_runner_output.req_id_to_index[req_id] - num_new_tokens = 1 - request_sample_logprobs = request.request_sample_logprobs - request_do_logprobs = (do_batch_sample_logprobs - and request_sample_logprobs is not None - and request_sample_logprobs > 0) - - if do_batch_prompt_logprobs: - request_prompt_logprobs = request.request_prompt_logprobs - # Number of new prompt tokens is the number of scheduled - # tokens *if* the request is partial (because the sampled - # token is discarded and all sequence offsets are prompt - # offsets), otherwise it is the number of scheduled - # tokens minus one (for the sampled token) - req_is_not_partial = (scheduler_output.partial_req_index != - req_index) - num_new_prompt_tokens = ( - num_scheduled_tokens[request.request_id] - - int(req_is_not_partial)) - - request_do_prompt_logprobs = (request_prompt_logprobs - is not None - and request_prompt_logprobs > 0 - and num_new_prompt_tokens > 0) - - if request_do_prompt_logprobs: - # Construct prompt logprobs, under the condition that - # prompt logprobs were requested & a nonzero number of - # prompt tokens were computed in this step for this request. - # - # Pythonization is deferred to outside the engine core. - # - # Note that this scenario returns an EngineCoreOutput which - # is empty except for the prompt logprobs which were - # computed for these prompt tokens. - # - # Note: new_prompt_logprobs will be used later to build the - # engine core output - assert (model_runner_output.batch_prompt_logprobs_cpu - is not None) - assert ( - model_runner_output.batch_prompt_logprob_token_ids_cpu - is not None) - assert request.prompt_logprobs is not None - assert request.prompt_logprob_token_ids is not None - logprob_cnt = request_prompt_logprobs - mr_output_slice_upper_index = ( - mr_output_slice_lower_index + num_new_prompt_tokens) - new_prompt_logprobs = ( - model_runner_output.batch_prompt_logprobs_cpu[ - mr_output_slice_lower_index: - mr_output_slice_upper_index, 0:logprob_cnt]) - new_prompt_logprob_token_ids = ( - model_runner_output.batch_prompt_logprob_token_ids_cpu[ - mr_output_slice_lower_index: - mr_output_slice_upper_index, 0:logprob_cnt]) - - req_slice_upper_index = (prev_num_computed_tokens + - num_new_prompt_tokens) - request.prompt_logprobs[ - prev_num_computed_tokens: - req_slice_upper_index] = new_prompt_logprobs - request.prompt_logprob_token_ids[ - prev_num_computed_tokens: - req_slice_upper_index] = new_prompt_logprob_token_ids - mr_output_slice_lower_index = mr_output_slice_upper_index - else: - mr_output_slice_lower_index += num_new_prompt_tokens - else: - request_do_prompt_logprobs = False - - # When the request's num_computed_tokens catches up its num_tokens, - # the request generates output tokens. 
Otherwise, we ignore the - # sampler output for the request. - assert request.num_computed_tokens <= request.num_tokens - - cached_encoder_input_ids = ( - scheduler.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - scheduler.encoder_cache_manager.free(request, input_id) - - if request.num_computed_tokens == request.num_tokens: - # NOTE(woosuk): Currently, we assume that each request - # generates at most one token at each step. - token_id = sampled_token_ids[req_index] - if request_do_logprobs: - assert model_runner_output.batch_logprobs_cpu is not None - assert (model_runner_output.batch_logprob_token_ids_cpu - is not None) - assert request.logprobs is not None - # Slice out this request's sample logprobs; defer - # pythonization to be carried out in the frontend. - request.logprobs.append( - (model_runner_output.batch_logprobs_cpu[req_index], - model_runner_output. - batch_logprob_token_ids_cpu[req_index])) - request.append_output_token_ids(token_id) - # TODO: Update the KV cache manager for prefix caching. - - # Check for stop and update request state. - # This must be called before me make the EngineCoreOutput. - stopped = scheduler._check_stop(request) - - # Compute engine core output logprobs list as such, - # so the type checker can see the assert - if request_do_logprobs: - assert request.logprobs is not None - logprobs = request.logprobs[-num_new_tokens:] - else: - logprobs = None - - # Add EngineCoreOutput for this Request. - # Return the logprob for the most recently computed tokens. - # Return no prompt logprobs in decode-phase. - output = EngineCoreOutput( - request_id=req_id, - new_token_ids=request.output_token_ids[-num_new_tokens:], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=logprobs, - prompt_logprobs=(new_prompt_logprobs - if request_do_prompt_logprobs else None), - prompt_logprobs_token_ids=(new_prompt_logprob_token_ids - if request_do_prompt_logprobs - else None)) - engine_core_outputs.append(output) - - # Breakout of the loop. 
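As an illustration of the bookkeeping in the deleted update_from_output() above: prompt logprobs for all requests in a step arrive as one flat [num_prompt_tokens, max_k] tensor, and a running lower/upper index pair hands each request its own slice. The sketch below is hedged and self-contained; split_prompt_logprobs and its arguments are illustrative names, not vLLM APIs.

# Illustrative sketch (not vLLM code) of slicing per-request prompt-logprob
# rows out of the flat model-runner output, in scheduling order.
from typing import Dict, List, Tuple

import torch


def split_prompt_logprobs(
    flat_logprobs: torch.Tensor,  # [num_prompt_tokens, max_k]
    new_prompt_tokens_per_req: List[Tuple[str, int]],  # (req_id, num_new_prompt_tokens)
) -> Dict[str, torch.Tensor]:
    out: Dict[str, torch.Tensor] = {}
    lower = 0
    for req_id, num_new in new_prompt_tokens_per_req:
        upper = lower + num_new
        out[req_id] = flat_logprobs[lower:upper]
        lower = upper
    assert lower == flat_logprobs.shape[0], "every row must be consumed"
    return out


if __name__ == "__main__":
    flat = torch.randn(7, 5)
    per_req = split_prompt_logprobs(flat, [("req-0", 3), ("req-1", 4)])
    assert per_req["req-0"].shape == (3, 5) and per_req["req-1"].shape == (4, 5)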
- if stopped: - continue - - elif request_do_prompt_logprobs: - # This request is still partial but prompt logprobs were - # requested - engine_core_outputs.append( - EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=[] if request_do_logprobs else None, - prompt_logprobs=new_prompt_logprobs, - prompt_logprobs_token_ids=new_prompt_logprob_token_ids) - ) - - new_running.append(request) - scheduler.running = new_running - return engine_core_outputs - def step(self) -> List[EngineCoreOutput]: """Schedule, execute, and make output.""" @@ -314,7 +124,8 @@ def step(self) -> List[EngineCoreOutput]: scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) - engine_core_outputs = self.update_from_output(scheduler_output, output) + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, output) return engine_core_outputs def shutdown(self): From 72eed99309a6a7d84e252610a172a4d87729b20a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 1 Jan 2025 17:58:57 -0500 Subject: [PATCH 171/293] updated --- vllm/v1/core/scheduler.py | 24 +-- vllm/v1/outputs.py | 19 +- vllm/v1/sample/metadata.py | 14 +- vllm/v1/sample/sampler.py | 314 +++++------------------------ vllm/v1/worker/gpu_input_batch.py | 36 ++-- vllm/v1/worker/gpu_model_runner.py | 55 +++-- 6 files changed, 124 insertions(+), 338 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ea6ded4be9be0..f202f8a87ef86 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -10,7 +10,7 @@ from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine import EngineCoreOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -109,11 +109,11 @@ def schedule(self) -> "SchedulerOutput": # but not all. The constraint is due to the persistent batch in the # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. - partial_req_index = None + partial_req_id = None req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". - assert partial_req_index is None + assert partial_req_id is None assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens @@ -162,8 +162,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens if (request.num_computed_tokens + num_new_tokens < request.num_tokens): - assert partial_req_index is None - partial_req_index = req_index + assert partial_req_id is None + partial_req_id = request.request_id req_index += 1 # Encoder-related. @@ -178,7 +178,7 @@ def schedule(self) -> "SchedulerOutput": # Next, schedule the WAITING requests. 
if not preempted_reqs: while self.waiting: - if partial_req_index: + if partial_req_id: break if len(self.running) == self.max_num_running_reqs: break @@ -245,8 +245,8 @@ def schedule(self) -> "SchedulerOutput": request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens if (num_computed_tokens + num_new_tokens < request.num_tokens): - assert partial_req_index is None - partial_req_index = req_index + assert partial_req_id is None + partial_req_id = request.request_id # Encoder-related. if encoder_inputs_to_schedule: @@ -287,7 +287,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, - partial_req_index=partial_req_index, + partial_req_id=partial_req_id, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -434,8 +434,8 @@ def update_from_output( # token is discarded and all sequence offsets are prompt # offsets), otherwise it is the number of scheduled # tokens minus one (for the sampled token) - req_is_not_partial = (scheduler_output.partial_req_index != - req_index) + req_is_not_partial = (scheduler_output.partial_req_id != + req_id) num_new_prompt_tokens = ( num_scheduled_tokens[request.request_id] - int(req_is_not_partial)) @@ -725,7 +725,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] - partial_req_index: Optional[int] + partial_req_id: Optional[str] num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 74174b7678023..bf86f3fed4af7 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -5,6 +5,14 @@ import torch +@dataclass +class PromptLogprobsOutput: + + # [num_reqs, max_num_logprobs + 1] + logprob_token_ids: Optional[torch.Tensor] = None + logprobs: Optional[torch.Tensor] = None + + @dataclass class SamplerOutput: @@ -12,14 +20,9 @@ class SamplerOutput: sampled_token_ids: List[int] # [num_reqs, max_num_logprobs + 1] - batch_sample_logprob_token_ids: Optional[torch.Tensor] = None - # [num_reqs, max_num_logprobs + 1] - batch_sample_logprobs: Optional[torch.Tensor] = None - - # [num_prompt_tokens, max_num_prompt_logprobs + 1] - batch_prompt_logprobs: Optional[torch.Tensor] = None - # [num_prompt_tokens, max_num_prompt_logprobs + 1] - batch_prompt_logprob_token_ids: Optional[torch.Tensor] = None + logprob_token_ids: Optional[torch.Tensor] = None + logprobs: Optional[torch.Tensor] = None + # ModelRunnerOutput is serialized and sent to the scheduler process. 
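The scheduler hunk above switches from tracking the partial request by batch index to tracking it by request id, and update_from_output() uses that id to decide how many of the scheduled tokens count as prompt tokens. A hedged sketch of that counting rule (function and argument names are illustrative):

# With chunked prefill there is at most one partial request per step. Only a
# request that is *not* partial gives up one scheduled slot to its sampled
# token when counting newly computed prompt positions.
from typing import Optional


def num_new_prompt_tokens(num_scheduled: int, req_id: str,
                          partial_req_id: Optional[str]) -> int:
    req_is_not_partial = (partial_req_id != req_id)
    return num_scheduled - int(req_is_not_partial)


assert num_new_prompt_tokens(16, "a", partial_req_id="a") == 16  # still prefilling
assert num_new_prompt_tokens(16, "b", partial_req_id=None) == 15  # last slot was sampled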
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 38297ccac355a..0585201a148cc 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Optional +from typing import Dict, List import torch @@ -17,14 +17,4 @@ class SamplingMetadata: no_top_k: bool generators: Dict[int, torch.Generator] - - # Max number of sample or prompt logprobs - # (respectiely) at the batch level - max_num_batch_sample_logprobs: int - max_num_batch_prompt_logprobs: int - - # Attributes which support logprob computation - query_start_loc: Optional[torch.Tensor] - num_query_tokens: Optional[torch.Tensor] - num_input_tokens: int - partial_req_index: int # >0 if there is a partial request, -1 o/w + max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index dea64c4f9b134..a2a5107452b45 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn -from vllm.v1.outputs import SamplerOutput +from vllm.v1.outputs import SamplerOutput, LogprobsOutput from vllm.v1.sample.metadata import SamplingMetadata _SAMPLING_EPS = 1e-5 @@ -12,20 +12,6 @@ class Sampler(nn.Module): - def _apply_temperature_top_k_top_p( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - num_query_tokens: Optional[torch.Tensor], - ) -> torch.Tensor: - - temperature = (sampling_metadata.temperature if - num_query_tokens is None else torch.repeat_interleave( - sampling_metadata.temperature, num_query_tokens)) - - return self._apply_top_k_top_p( - self._apply_temperature(logits, temperature), sampling_metadata) - def _probs_sample( self, maybe_sample_logits: torch.Tensor, @@ -36,176 +22,27 @@ def _probs_sample( # Use int32 to reduce the tensor size. return sampled.to(torch.int32) - def _top_logprobs_token_indices( - self, - logprob_values: torch.Tensor, - max_num_logprobs: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Compute top logprobs and associated token indices - - Args: - logprobs: total_tokens x vocab tensor - max_num_logprobs: Max number of top {sample,prompt} logprobs - requested in batch (depending on whether top sample - logprobs or top prompt logprobs are being computed). - This will be the k. - - Returns: - Top logprobs, total_tokens x max_num_logprobs tensor - Top logprob token indices, total_tokens x max_num_logprobs tensor - """ - topk_logprobs, topk_indices = torch.topk(logprob_values, - max_num_logprobs, - dim=-1) - # Use int32 to reduce the tensor size. - return topk_logprobs, topk_indices.to(torch.int32) + - def _compute_logprobs_from_processed_logits( + def compute_logprobs( self, - do_batch_sample_logprobs: bool, - do_batch_prompt_logprobs: bool, - maybe_sampled: torch.Tensor, - maybe_sample_logits_indices: Optional[torch.Tensor], - prompt_logits_mask: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - maybe_sample_logits_w_tmp_tpk_tpp: torch.Tensor, - logits_w_tmp_tpk_tpp: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Compute sample and prompt logprobs as required by batch config + logits: torch.Tensor, + max_num_logprobs: int + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + """Compute logprobs and move to CPU.""" - Consumes logits which have already had temperature, top-k and top-p - applied. 
- - `do_batch_sample_logprobs` and `do_batch_prompt_logprobs` control - whether sample and prompt logprobs are computed, respectively. - - This function does not handle the case where no logprobs are required - at the batch level; it is assumed this function will not be called in - that scenario. - - Args: - do_batch_sample_logprobs: at least one request in the batch requires - sample logprobs to be computed - do_batch_prompt_logprobs: at least one request in the batch requires - prompt logprobs to be computed - maybe_sampled: list of sampled tokens; if there is a partial request, - includes the partial request's sampled token (which - will later be discarded.) - maybe_sample_logits_indices: sequence-offset indices where a new - token is decoded; if there is a partial request, - includes the index of the partial request's sampled - token (which will later be discarded.) - prompt_logits_mask: mask indicating the sequence offsets of prompt - tokens. Note: if there is a partial request, - this mask includes the index of the partial request's - sample token (since this sampled token will be - discarded, but the logprobs computed at this offset - are part of the prompt logprobs.) Note that this means - prompt_logits_mask and maybe_sample_logits_indices - may have overlap. - sampling_metadata - maybe_sample_logits_w_tmp_tpk_tpp: assumed to be logits gathered - from sequence offsets where a new token is being - decoded (including for a partial request); assumed - that temperature, top-k and top-p have been applied. - logits_w_tmp_tpk_tpp: optional; all logits with temperature, top-k, - top-p applied. - - Returns: - Sample logprobs (`None` if `do_batch_sample_logprobs == False`, - o/w num_samples x max_num_logprobs tensor) - Sample logprobs token indices (`None` if - `do_batch_sample_logprobs == False`, - o/w num_samples x max_num_logprobs tensor) - Prompt logprobs (`None` if `do_batch_prompt_logprobs == False`, - o/w num_prompt_tokens x max_num_prompt_logprobs - tensor) - Prompt logprobs token indices (`None` if - `do_batch_prompt_logprobs == False`, o/w - num_prompt_tokens x max_num_prompt_logprobs tensor) - """ - - assert do_batch_sample_logprobs or do_batch_prompt_logprobs - if do_batch_sample_logprobs and do_batch_prompt_logprobs: - # Batch requires sample and prompt logprobs - - # - Compute logprobs for all sequence offsets - logprobs = self.get_logprobs(logits_w_tmp_tpk_tpp) - - # - Compute *top* logprobs for sequence offsets - # where a new token is being decoded - ( - maybe_sample_topk_logprobs, - maybe_sample_topk_indices, - ) = self._top_logprobs_token_indices( - logprobs[maybe_sample_logits_indices, :], - sampling_metadata.max_num_batch_sample_logprobs) - - # - In case sampled tokens are not in the top logprobs at their - # respective sequence offsets, gather logprobs associated with - # sampled tokens - maybe_sampled_logprobs = logprobs[maybe_sample_logits_indices, - maybe_sampled] - - return (( - # Sample logprobs (including sampled tokens) - torch.cat((maybe_sample_topk_logprobs, - maybe_sampled_logprobs.unsqueeze(-1)), - dim=-1), - # Sample logprobs token indices (including sampled tokens) - torch.cat( - (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), - dim=-1)) + - # Prompt logprobs and token indices - self._top_logprobs_token_indices( - logprobs[prompt_logits_mask, :], - sampling_metadata.max_num_batch_prompt_logprobs)) - elif do_batch_sample_logprobs: - # Batch requires only sample logprobs - - # - Compute top logprobs only at sequence offsets where new tokens - # are 
being decoded - logprobs = self.get_logprobs(maybe_sample_logits_w_tmp_tpk_tpp) - ( - maybe_sample_topk_logprobs, - maybe_sample_topk_indices, - ) = self._top_logprobs_token_indices( - logprobs, sampling_metadata.max_num_batch_sample_logprobs) - - # - In case sampled tokens are not in the top logprobs at their - # respective sequence offsets, gather logprobs associated with - # sampled tokens - maybe_sampled_logprobs = logprobs[ - torch.arange(maybe_sampled.shape[0]), maybe_sampled] - - # - Concat sampled token logprobs - maybe_sample_topk_logprobs = torch.cat( - (maybe_sample_topk_logprobs, - maybe_sampled_logprobs.unsqueeze(-1)), - dim=-1) - # - Concat sampled token id - maybe_sample_topk_indices = torch.cat( - (maybe_sample_topk_indices, maybe_sampled.unsqueeze(-1)), - dim=-1) - - # Return sample logprobs - return (maybe_sample_topk_logprobs, maybe_sample_topk_indices, - None, None) - - elif do_batch_prompt_logprobs: - # Batch requires only prompt logprobs - - # - Compute top logprobs only at sequence offsets of prompt tokens - assert logits_w_tmp_tpk_tpp is not None - logprobs = self.get_logprobs( - logits_w_tmp_tpk_tpp[prompt_logits_mask, :]) - - # Return prompt logprobs - return ((None, None) + self._top_logprobs_token_indices( - logprobs, sampling_metadata.max_num_batch_prompt_logprobs)) - - raise ValueError("One or both of Logprobs and Prompt Logprobs must" - " be enabled to use this method.") + if max_num_logprobs > 0: + logprobs = self.get_logprobs(logits) + # FIXME: Mask the sampled token_id, get topk logprobs, + # and concatenate the topk with the sampled token_id. + topk_logprobs, topk_indices = torch.topk( + logprobs, max_num_logprobs, dim=-1) + # Use int32 to reduce the tensor size. + topk_indices = topk_indices.to(torch.int32) + + return topk_logprobs.cpu(), topk_indices.cpu() + else: + return None, None def forward( self, @@ -228,93 +65,27 @@ def forward( (if requested) """ - # Batch-level logprobs configs. `do_batch_sample_logprobs` - # indicates whether any request requires sample logprobs. - # `do_batch_prompt_logprobs` indicates whether any request - # requires prompt logprobs. `do_batch_any_logprobs` indicates - # whether, overall, any request in the batch requires logprobs - # computed - do_batch_sample_logprobs = ( - sampling_metadata.max_num_batch_sample_logprobs > 0) - do_batch_prompt_logprobs = ( - sampling_metadata.max_num_batch_prompt_logprobs > 0) - do_batch_any_logprobs = (do_batch_sample_logprobs - or do_batch_prompt_logprobs) - - num_query_tokens = sampling_metadata.num_query_tokens - # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial - # request in the batch. While we should not sample any token from this - # partial request, we do so for simplicity. We will ignore the sampled - # token from the partial request. - assert sampling_metadata.query_start_loc is not None - maybe_sample_logits_indices = sampling_metadata.query_start_loc[1:] - 1 - prompt_logits_mask = torch.ones(sampling_metadata.num_input_tokens, - dtype=torch.bool) - # Sequence offsets where a token is being decoded are *not* prompt - # tokens... - pdx = sampling_metadata.partial_req_index - prompt_logits_mask[maybe_sample_logits_indices] = False - # ...unless the request in question is partial - prompt_logits_mask[maybe_sample_logits_indices[pdx]] = True - - # Apply temperature, top-k and top-p to logits at sequence offsets - # where a new token is being decoded. 
- if do_batch_prompt_logprobs: - # If prompt logprobs are required, then temp/top-k/top-p - # must also be applied to prompt logits as a prerequisite. - # So pass *all* logits through temp/top-k/top-p, then gather - # the processed logits from the sequence offsets where a new token - # is being decoded. - logits_w_tmp_tpk_tpp = self._apply_temperature_top_k_top_p( - logits, sampling_metadata, num_query_tokens) - - maybe_sample_logits_w_tmp_tpk_tpp = ( - logits_w_tmp_tpk_tpp[maybe_sample_logits_indices]) - else: - # If prompt logprobs are not required, then gather the logits - # only from the sequence offsets where a new token is being - # decoded, and *only* apply temp/top-k/top-p to those logits. - maybe_sample_logits_w_tmp_tpk_tpp = ( - self._apply_temperature_top_k_top_p( - logits[maybe_sample_logits_indices], sampling_metadata, - None)) - - # Compute and sample token probability distribution, *only* at sequence - # offsets where a new token is being decoded - maybe_sampled = self._probs_sample(maybe_sample_logits_w_tmp_tpk_tpp, - sampling_metadata) + # Sample next token. + logits = self._process_logits(logits, sampling_metadata) + probs = self.get_probs(logits) + sampled = self.sample(probs, sampling_metadata) + # Use int32 to reduce the tensor size. + sampled = sampled.to(torch.int32) - # Compute sample & prompt logprobs, as-needed - if do_batch_any_logprobs: - ( - maybe_sample_logprobs, - maybe_sample_logprobs_token_indices, - prompt_logprobs, - prompt_logprobs_token_indices, - ) = self._compute_logprobs_from_processed_logits( - do_batch_sample_logprobs=do_batch_sample_logprobs, - do_batch_prompt_logprobs=do_batch_prompt_logprobs, - maybe_sampled=maybe_sampled, - maybe_sample_logits_indices=maybe_sample_logits_indices, - prompt_logits_mask=prompt_logits_mask, - sampling_metadata=sampling_metadata, - maybe_sample_logits_w_tmp_tpk_tpp= - maybe_sample_logits_w_tmp_tpk_tpp, - logits_w_tmp_tpk_tpp=(logits_w_tmp_tpk_tpp - if do_batch_prompt_logprobs else None)) + # Compute the logprobs. + # NOTE: CPU-GPU synchronization happens here. + logprob_token_ids, logprobs = self.compute_logprobs( + logits, + sampling_metadata.max_num_logprobs + ) - # Return decoded output tokens and sample/prompt logprobs, - # as required - return SamplerOutput( - sampled_token_ids=maybe_sampled.tolist(), - batch_sample_logprobs=maybe_sample_logprobs, - batch_sample_logprob_token_ids= - maybe_sample_logprobs_token_indices, - batch_prompt_logprobs=prompt_logprobs, - batch_prompt_logprob_token_ids=prompt_logprobs_token_indices) - else: - # No logprobs; return decoded output tokens - return SamplerOutput(sampled_token_ids=maybe_sampled.tolist()) + # NOTE: CPU-GPU synchronization happens here. 
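A runnable sketch of the torch.topk pattern used by the new compute_logprobs() above; the .cpu() calls are where the GPU-to-CPU synchronization mentioned in the NOTE happens. As the FIXME says, inserting the sampled token's own logprob is still missing, so this sketch omits it too; names here are illustrative.

import torch


def topk_logprobs(logits: torch.Tensor, k: int):
    """Return (top-k logprobs, top-k token ids) on CPU, or (None, None) if k == 0."""
    if k <= 0:
        return None, None
    logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float32)
    top_vals, top_ids = torch.topk(logprobs, k, dim=-1)
    # int32 keeps the transferred tensor small; .cpu() triggers the sync.
    return top_vals.cpu(), top_ids.to(torch.int32).cpu()


vals, ids = topk_logprobs(torch.randn(4, 32000), k=5)
assert vals.shape == (4, 5) and ids.dtype == torch.int32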
+ sampler_output = SamplerOutput( + sampled_token_ids=sampled.tolist(), + logprob_token_ids=logprob_token_ids, + logprobs=logprobs, + ) + return sampler_output def _apply_temperature( self, @@ -342,6 +113,15 @@ def _apply_top_k_top_p( sampling_metadata.top_p, ) + def _process_logits( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + logits = self.apply_temperature(logits, sampling_metadata.temperature) + logits = self.apply_top_k_top_p(logits, sampling_metadata) + return logits + def get_probs(self, logits: torch.Tensor) -> torch.Tensor: return torch.softmax(logits, dim=-1, dtype=torch.float32) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 224efe915fd9c..f121f47d97978 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -110,14 +110,20 @@ def __init__( self.top_k_cpu = self.top_k_cpu_tensor.numpy() self.top_k_reqs: Set[str] = set() + # Logprobs-related. + # NOTE(rob): The prompt logprobs trackers only include reqs that + # are actively generating logprobs (i.e. in prefill phase). + self.num_logprobs: Dict[str, int] = {} + self.num_prompt_logprobs: Dict[str, int] = {} + + # NOTE(rob): The req indexes that need sampling + self.needs_logits: set[int] = set() + # req_index -> generator # NOTE(woosuk): The indices of the requests that do not have their own # generator should not be included in the dictionary. self.generators: Dict[int, torch.Generator] = {} - self.num_logprobs: Dict[str, int] = {} - self.num_prompt_logprobs: Dict[str, int] = {} - self.prompt_logprob_reqs: Set[str] = set() def add_request( self, @@ -166,12 +172,11 @@ def add_request( num_logprobs = sampling_params.logprobs num_prompt_logprobs = sampling_params.prompt_logprobs - if num_logprobs is not None and num_logprobs > 0: + if num_logprobs and num_logprobs > 0: self.num_logprobs[req_id] = num_logprobs - if num_prompt_logprobs is not None and num_prompt_logprobs > 0: + if num_prompt_logprobs and num_prompt_logprobs > 0: self.num_prompt_logprobs[req_id] = num_prompt_logprobs - if sampling_params.prompt_logprobs: - self.prompt_logprob_reqs.add(req_id) + def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) @@ -186,7 +191,6 @@ def remove_request(self, req_id: str) -> Optional[int]: self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) self.num_prompt_logprobs.pop(req_id, None) - self.prompt_logprob_reqs.discard(req_id) return req_index def clear(self) -> None: @@ -199,7 +203,6 @@ def clear(self) -> None: self.generators.clear() self.num_logprobs.clear() self.num_prompt_logprobs.clear() - self.prompt_logprob_reqs.clear() def condense(self, empty_req_indices: List[int]) -> None: if self.num_reqs == 0: @@ -247,9 +250,6 @@ def condense(self, empty_req_indices: List[int]) -> None: def make_sampling_metadata( self, - partial_req_index: int, - num_input_tokens: int, - query_start_loc: torch.Tensor, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -273,15 +273,7 @@ def make_sampling_metadata( generators=self.generators, max_num_batch_sample_logprobs=self.max_num_logprobs, max_num_batch_prompt_logprobs=self.max_num_prompt_logprobs, - # Required for sampling indices computation - query_start_loc=query_start_loc, - num_input_tokens=num_input_tokens, - partial_req_index=partial_req_index, - # Required for prompt logprobs temperature computation. 
- # If prompt logprobs is not required for this batch, then - # avoid storing num_query_tokens - num_query_tokens=(torch.diff(query_start_loc) - if self.max_num_prompt_logprobs > 0 else None)) + prompt_logprobs_req_indices=[xxx]) @property def num_reqs(self) -> int: @@ -318,4 +310,4 @@ def no_logprob(self) -> bool: @property def no_prompt_logprob(self) -> bool: - return len(self.prompt_logprob_reqs) == 0 + return len(self.num_prompt_logprobs) == 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 80bbdb69cf436..da155c6e5c257 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -208,6 +208,14 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.input_batch.block_table_cpu[ req_index, start_index:end_index] = req_data.new_block_ids + # TODO(rob): is there a cleaner way to do this? + # If the request is still in the prompt phase and requires + # prompt logprobs, include it in active_prompt_logprobs. + if (req_id == scheduler_output.partial_req_id + and req_id in self.input_batch.num_prompt_logprobs): + self.input_batch.active_prompt_logprobs[req_id] = ( + self.input_batch.num_prompt_logprobs[req_id]) + req_ids_to_add: List[str] = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: @@ -262,7 +270,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> FlashAttentionMetadata: + ) -> Tuple[FlashAttentionMetadata, np.ndarray[bool]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -367,11 +375,22 @@ def _prepare_inputs( block_table=self.input_batch.block_table[:num_reqs], slot_mapping=slot_mapping, ) - # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial - # request in the batch. While we should not sample any token from this - # partial request, we do so for simplicity. We will ignore the sampled - # token from the partial request. - return attn_metadata + + # Get mask of indices that need logits for sampling. + sample_logits_mask = query_start_loc[1:] - 1 + + # Get mask of indices that need logits for prompt logprobs. + # NOTE(rob): we should avoid loops over all reqs in this fn, + # but the size of num_prompt_logprobs is small since it is + # only requests that are currently in the prefill phase. + prompt_logprobs_indicies = [ + self.input_batch.req_id_to_index[req_id] + for req_id in self.input_batch.num_prompt_logprobs + ] + prompt_logits_mask = torch.from_numpy( + np.isin(req_indices, prompt_logprobs_indicies)) + + return attn_metadata, sample_logits_mask, prompt_logits_mask def _prepare_sampling( self, @@ -483,7 +502,8 @@ def execute_model( encoder_outputs = [] # Prepare the decoder inputs. 
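The _prepare_inputs() hunk above builds a per-token mask for prompt-logprob requests with np.isin over the flattened req_indices array, where req_indices[i] is the request index that owns the i-th scheduled token. A small standalone illustration with made-up values:

import numpy as np
import torch

req_indices = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2])  # 3 requests, 9 scheduled tokens
prompt_logprob_req_idxs = [0, 2]                     # requests that want prompt logprobs

prompt_logits_mask = torch.from_numpy(np.isin(req_indices, prompt_logprob_req_idxs))
assert prompt_logits_mask.tolist() == [True, True, True, False, False,
                                       True, True, True, True]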
- attn_metadata = self._prepare_inputs(scheduler_output) + attn_metadata, sample_logits_mask, prompt_logits_mask = ( + self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -498,12 +518,6 @@ def execute_model( sampling_metadata = self._prepare_sampling( scheduler_output, num_input_tokens, attn_metadata.query_start_loc) - # Indicate whether one or more requests in the batch require sample - # logprobs or prompt logprobs to be computed, respectively - do_batch_sample_logprobs = ( - sampling_metadata.max_num_batch_sample_logprobs > 0) - do_batch_prompt_logprobs = ( - sampling_metadata.max_num_batch_prompt_logprobs > 0) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision @@ -537,12 +551,13 @@ def execute_model( attn_metadata=None, inputs_embeds=inputs_embeds, ) - hidden_states = hidden_states[:num_scheduled_tokens] + sample_hidden_states = hidden_states[sample_logits_mask] + sample_logits = self.model.compute_logits(sample_hidden_states, None) - # Sample the next token and get logprobs if needed. - sampler_output: SamplerOutput = self.model.sample( - logits=self.model.compute_logits(hidden_states, None), + # Sample the next token. + sampler_output = self.model.sample( + logits=sample_logits, sampling_metadata=sampling_metadata, ) @@ -569,6 +584,12 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # Indicate whether one or more requests in the batch require sample + # logprobs or prompt logprobs to be computed, respectively + do_batch_sample_logprobs = ( + sampling_metadata.max_num_batch_sample_logprobs > 0) + do_batch_prompt_logprobs = ( + sampling_metadata.max_num_batch_prompt_logprobs > 0) # Prepare batch-level sample logprobs in a way that the type-checker # understands if do_batch_sample_logprobs: From 7d6eb229963e36aa7ce5c7a59b558e437d5e413f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 1 Jan 2025 18:02:41 -0500 Subject: [PATCH 172/293] stahs --- vllm/v1/outputs.py | 1 - vllm/v1/sample/metadata.py | 8 +++ vllm/v1/sample/sampler.py | 86 ++++++++++++++----------------- vllm/v1/worker/gpu_input_batch.py | 4 +- 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index bf86f3fed4af7..08b022ae2b807 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -22,7 +22,6 @@ class SamplerOutput: # [num_reqs, max_num_logprobs + 1] logprob_token_ids: Optional[torch.Tensor] = None logprobs: Optional[torch.Tensor] = None - # ModelRunnerOutput is serialized and sent to the scheduler process. 
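In the execute_model() hunk above, sample_logits_mask is really a vector of indices: with packed sequences, the hidden state needed for sampling is the one at each request's last scheduled position, i.e. query_start_loc[1:] - 1. A short illustration with invented lengths:

import torch

query_lens = torch.tensor([3, 2, 4])                    # tokens scheduled per request
query_start_loc = torch.cat([torch.zeros(1, dtype=torch.long),
                             query_lens.cumsum(0)])     # [0, 3, 5, 9]
sample_indices = query_start_loc[1:] - 1                # [2, 4, 8]

hidden_states = torch.randn(int(query_lens.sum()), 16)
sample_hidden_states = hidden_states[sample_indices]    # one row per request
assert sample_hidden_states.shape == (3, 16)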
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 0585201a148cc..9e98118a60fff 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -18,3 +18,11 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] max_num_logprobs: int + + +@dataclass +class PromptLogprobsMetadata: + + temperature: torch.Tensor + top_p: torch.Tensor + top_k: torch.Tensor diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index a2a5107452b45..cba6f1be46858 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn -from vllm.v1.outputs import SamplerOutput, LogprobsOutput +from vllm.v1.outputs import SamplerOutput, PromptLogprobsOutput from vllm.v1.sample.metadata import SamplingMetadata _SAMPLING_EPS = 1e-5 @@ -12,38 +12,6 @@ class Sampler(nn.Module): - def _probs_sample( - self, - maybe_sample_logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: - probs = self.get_probs(maybe_sample_logits) - sampled = self.sample(probs, sampling_metadata) - # Use int32 to reduce the tensor size. - return sampled.to(torch.int32) - - - - def compute_logprobs( - self, - logits: torch.Tensor, - max_num_logprobs: int - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - """Compute logprobs and move to CPU.""" - - if max_num_logprobs > 0: - logprobs = self.get_logprobs(logits) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, max_num_logprobs, dim=-1) - # Use int32 to reduce the tensor size. - topk_indices = topk_indices.to(torch.int32) - - return topk_logprobs.cpu(), topk_indices.cpu() - else: - return None, None - def forward( self, logits: torch.Tensor, @@ -72,12 +40,10 @@ def forward( # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) - # Compute the logprobs. + # Compute the logprobs if requested. # NOTE: CPU-GPU synchronization happens here. - logprob_token_ids, logprobs = self.compute_logprobs( - logits, - sampling_metadata.max_num_logprobs - ) + logprob_token_ids, logprobs = self._compute_logprobs( + logits, sampling_metadata.max_num_logprobs) # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( @@ -87,6 +53,41 @@ def forward( ) return sampler_output + def get_prompt_logprobs( + self, + logits: torch.Tensor, + + ) -> PromptLogprobsOutput: + + + + def _compute_logprobs( + self, + logits: torch.Tensor, + max_num_logprobs: int + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + if max_num_logprobs > 0: + logprobs = self.get_logprobs(logits) + # FIXME: Mask the sampled token_id, get topk logprobs, + # and concatenate the topk with the sampled token_id. + topk_logprobs, topk_indices = torch.topk( + logprobs, max_num_logprobs, dim=-1) + # Use int32 to reduce the tensor size. 
+ topk_indices = topk_indices.to(torch.int32) + + return topk_logprobs.cpu(), topk_indices.cpu() + else: + return None, None + + def _process_logits( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + logits = self._apply_temperature(logits, sampling_metadata.temperature) + logits = self._apply_top_k_top_p(logits, sampling_metadata) + return logits + def _apply_temperature( self, logits: torch.Tensor, @@ -113,15 +114,6 @@ def _apply_top_k_top_p( sampling_metadata.top_p, ) - def _process_logits( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: - logits = self.apply_temperature(logits, sampling_metadata.temperature) - logits = self.apply_top_k_top_p(logits, sampling_metadata) - return logits - def get_probs(self, logits: torch.Tensor) -> torch.Tensor: return torch.softmax(logits, dim=-1, dtype=torch.float32) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index f121f47d97978..720a9c7870d86 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -111,7 +111,7 @@ def __init__( self.top_k_reqs: Set[str] = set() # Logprobs-related. - # NOTE(rob): The prompt logprobs trackers only include reqs that + # NOTE(rob): The prompt logprobs trackers only include reqs that # are actively generating logprobs (i.e. in prefill phase). self.num_logprobs: Dict[str, int] = {} self.num_prompt_logprobs: Dict[str, int] = {} @@ -124,7 +124,6 @@ def __init__( # generator should not be included in the dictionary. self.generators: Dict[int, torch.Generator] = {} - def add_request( self, request: "CachedRequestState", @@ -177,7 +176,6 @@ def add_request( if num_prompt_logprobs and num_prompt_logprobs > 0: self.num_prompt_logprobs[req_id] = num_prompt_logprobs - def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) if req_index is None: From 7c4c231abd6c7ac3d10a6c62c53277bd0e53359f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 1 Jan 2025 18:14:01 -0500 Subject: [PATCH 173/293] stash --- vllm/v1/sample/metadata.py | 1 + vllm/v1/sample/sampler.py | 15 +++++++++--- vllm/v1/worker/gpu_input_batch.py | 8 ++----- vllm/v1/worker/gpu_model_runner.py | 37 +++++------------------------- 4 files changed, 21 insertions(+), 40 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 9e98118a60fff..00e86077a0d51 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -26,3 +26,4 @@ class PromptLogprobsMetadata: temperature: torch.Tensor top_p: torch.Tensor top_k: torch.Tensor + max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index cba6f1be46858..f6d898bbf9788 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -5,7 +5,8 @@ import torch.nn as nn from vllm.v1.outputs import SamplerOutput, PromptLogprobsOutput -from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.metadata import (SamplingMetadata, + PromptLogprobsMetadata) _SAMPLING_EPS = 1e-5 @@ -56,10 +57,18 @@ def forward( def get_prompt_logprobs( self, logits: torch.Tensor, - + prompt_logprobs_metadata: PromptLogprobsMetadata, ) -> PromptLogprobsOutput: + logits = self._process_logits(logits, prompt_logprobs_metadata) + + # Compute the prompt logprobs if requested. + # NOTE: CPU-GPU synchronization happens here. 
+ logprob_token_ids, logprobs = self._compute_logprobs( + logits, prompt_logprobs_metadata.max_num_logprobs) - + return PromptLogprobsOutput( + logprob_token_ids=logprob_token_ids, + logprobs=logprobs) def _compute_logprobs( self, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 720a9c7870d86..96dcba4881a30 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -116,9 +116,6 @@ def __init__( self.num_logprobs: Dict[str, int] = {} self.num_prompt_logprobs: Dict[str, int] = {} - # NOTE(rob): The req indexes that need sampling - self.needs_logits: set[int] = set() - # req_index -> generator # NOTE(woosuk): The indices of the requests that do not have their own # generator should not be included in the dictionary. @@ -269,9 +266,8 @@ def make_sampling_metadata( no_top_p=self.no_top_p, no_top_k=self.no_top_k, generators=self.generators, - max_num_batch_sample_logprobs=self.max_num_logprobs, - max_num_batch_prompt_logprobs=self.max_num_prompt_logprobs, - prompt_logprobs_req_indices=[xxx]) + max_num_logprobs=self.max_num_logprobs, + ) @property def num_reqs(self) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index da155c6e5c257..e6e423d0db5db 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -584,37 +584,12 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - # Indicate whether one or more requests in the batch require sample - # logprobs or prompt logprobs to be computed, respectively - do_batch_sample_logprobs = ( - sampling_metadata.max_num_batch_sample_logprobs > 0) - do_batch_prompt_logprobs = ( - sampling_metadata.max_num_batch_prompt_logprobs > 0) - # Prepare batch-level sample logprobs in a way that the type-checker - # understands - if do_batch_sample_logprobs: - assert (sampler_output.batch_sample_logprob_token_ids is not None) - assert (sampler_output.batch_sample_logprobs is not None) - batch_logprob_token_ids_cpu = ( - sampler_output.batch_sample_logprob_token_ids.cpu().numpy()) - batch_logprobs_cpu = ( - sampler_output.batch_sample_logprobs.cpu().numpy()) - else: - batch_logprob_token_ids_cpu = None - batch_logprobs_cpu = None - - # Prepare batch-level prompt logprobs in a way that the type-checker - # understands - if do_batch_prompt_logprobs: - assert (sampler_output.batch_prompt_logprob_token_ids is not None) - assert (sampler_output.batch_prompt_logprobs is not None) - batch_prompt_logprob_token_ids_cpu = ( - sampler_output.batch_prompt_logprob_token_ids.cpu().numpy()) - batch_prompt_logprobs_cpu = ( - sampler_output.batch_prompt_logprobs.cpu().numpy()) - else: - batch_prompt_logprob_token_ids_cpu = None - batch_prompt_logprobs_cpu = None + # Compute prompt logprobs. + prompt_hidden_states = hidden_states[prompt_logits_mask] + prompt_logits = self.model.compute_logits(prompt_hidden_states, None) + # TODO: why is the sampler part of the model def? 
+ prompt_logprobs_output = self.model.sampler.get_prompt_logprobs( + prompt_logits, prompt_logprobs_metadata) model_runner_output = ModelRunnerOutput( req_ids=cast(List[str], self.input_batch.req_ids[:num_reqs]), From eab5cebfbd06139a63a03b17ebb094cadd0970bb Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 1 Jan 2025 18:56:50 -0500 Subject: [PATCH 174/293] updated --- vllm/v1/outputs.py | 41 +++++++++---------- vllm/v1/sample/metadata.py | 18 +++++---- vllm/v1/sample/sampler.py | 63 +++++++++++++++--------------- vllm/v1/worker/gpu_model_runner.py | 28 +++++-------- 4 files changed, 70 insertions(+), 80 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 08b022ae2b807..0da9f8742c1c1 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,16 +1,5 @@ from dataclasses import dataclass -from typing import Dict, List, Optional - -import numpy.typing as npt -import torch - - -@dataclass -class PromptLogprobsOutput: - - # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: Optional[torch.Tensor] = None - logprobs: Optional[torch.Tensor] = None +from typing import Dict, List @dataclass @@ -19,9 +8,17 @@ class SamplerOutput: # [num_reqs] sampled_token_ids: List[int] - # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: Optional[torch.Tensor] = None - logprobs: Optional[torch.Tensor] = None + # [num_reqs, max_num_logprobs] + logprob_token_ids: List[int] + logprobs: List[int] + + +@dataclass +class PromptLogprobsOutput: + + # req_id -> [max_num_prompt_logprobs] + logprob_token_ids: Dict[str, List[int]] + logprobs: Dict[str, List[float]] # ModelRunnerOutput is serialized and sent to the scheduler process. @@ -37,12 +34,10 @@ class ModelRunnerOutput: # [num_reqs] sampled_token_ids: List[int] - # [num_reqs, max_num_logprobs + 1] - batch_logprob_token_ids_cpu: Optional[npt.NDArray] - # [num_reqs, max_num_logprobs + 1] - batch_logprobs_cpu: Optional[npt.NDArray] + # [num_reqs, max_num_logprobs] + logprob_token_ids: List[List[int]] + logprobs: List[List[float]] - # [num_reqs, max_num_prompt_logprobs] - batch_prompt_logprob_token_ids_cpu: Optional[npt.NDArray] - # [num_reqs, max_num_prompt_logprobs] - batch_prompt_logprobs_cpu: Optional[npt.NDArray] + # req_id -> [max_num_prompt_logprobs] + prompt_logprob_token_ids: Dict[str, List[int]] + prompt_logprobs: Dict[str, List[float]] diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 00e86077a0d51..4d06cdbb4d083 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -5,17 +5,21 @@ @dataclass -class SamplingMetadata: +class LogitsProcessMetadata: temperature: torch.Tensor - all_greedy: bool - all_random: bool - top_p: torch.Tensor top_k: torch.Tensor no_top_p: bool no_top_k: bool + +@dataclass +class SamplingMetadata: + + all_greedy: bool + all_random: bool + logits_process_metadata: LogitsProcessMetadata generators: Dict[int, torch.Generator] max_num_logprobs: int @@ -23,7 +27,7 @@ class SamplingMetadata: @dataclass class PromptLogprobsMetadata: - temperature: torch.Tensor - top_p: torch.Tensor - top_k: torch.Tensor + req_ids: List[str] + req_indicies: List[int] + logits_process_metadata: LogitsProcessMetadata max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index f6d898bbf9788..cbff438ca1c8a 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,11 +1,11 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Dict, Optional, Tuple +from typing import Dict, List, Tuple import torch 
import torch.nn as nn from vllm.v1.outputs import SamplerOutput, PromptLogprobsOutput -from vllm.v1.sample.metadata import (SamplingMetadata, +from vllm.v1.sample.metadata import (LogitsProcessMetadata, SamplingMetadata, PromptLogprobsMetadata) _SAMPLING_EPS = 1e-5 @@ -35,14 +35,15 @@ def forward( """ # Sample next token. - logits = self._process_logits(logits, sampling_metadata) + logits = self._process_logits( + logits, sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) # Compute the logprobs if requested. - # NOTE: CPU-GPU synchronization happens here. + # NOTE: logprob CPU-GPU synchronization happens here. logprob_token_ids, logprobs = self._compute_logprobs( logits, sampling_metadata.max_num_logprobs) @@ -59,42 +60,44 @@ def get_prompt_logprobs( logits: torch.Tensor, prompt_logprobs_metadata: PromptLogprobsMetadata, ) -> PromptLogprobsOutput: - logits = self._process_logits(logits, prompt_logprobs_metadata) + # Apply logits processor. + logits = self._process_logits( + logits, prompt_logprobs_metadata.logits_process_metadata) # Compute the prompt logprobs if requested. # NOTE: CPU-GPU synchronization happens here. logprob_token_ids, logprobs = self._compute_logprobs( logits, prompt_logprobs_metadata.max_num_logprobs) - - return PromptLogprobsOutput( - logprob_token_ids=logprob_token_ids, - logprobs=logprobs) - + + return PromptLogprobsOutput(logprob_token_ids=logprob_token_ids, + logprobs=logprobs) + def _compute_logprobs( - self, - logits: torch.Tensor, - max_num_logprobs: int - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + self, logits: torch.Tensor, + max_num_logprobs: int) -> Tuple[List[int], List[float]]: if max_num_logprobs > 0: logprobs = self.get_logprobs(logits) # FIXME: Mask the sampled token_id, get topk logprobs, # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, max_num_logprobs, dim=-1) + topk_logprobs, topk_indices = torch.topk(logprobs, + max_num_logprobs, + dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) - return topk_logprobs.cpu(), topk_indices.cpu() + # NOTE: CPU<>GPU synchronization happens here. 
+ return topk_indices.tolist(), topk_logprobs.tolist() else: - return None, None + return [], [] def _process_logits( self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata, + logits_process_metadata: LogitsProcessMetadata, ) -> torch.Tensor: - logits = self._apply_temperature(logits, sampling_metadata.temperature) - logits = self._apply_top_k_top_p(logits, sampling_metadata) + logits = self._apply_temperature(logits, + logits_process_metadata.temperature) + logits = self._apply_top_k_top_p(logits, logits_process_metadata) return logits def _apply_temperature( @@ -113,14 +116,14 @@ def _apply_temperature( def _apply_top_k_top_p( self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata, + logits_process_metadata: LogitsProcessMetadata, ) -> torch.Tensor: return _apply_top_k_top_p( logits, - sampling_metadata.no_top_k, - sampling_metadata.top_k, - sampling_metadata.no_top_p, - sampling_metadata.top_p, + logits_process_metadata.no_top_k, + logits_process_metadata.top_k, + logits_process_metadata.no_top_p, + logits_process_metadata.top_p, ) def get_probs(self, logits: torch.Tensor) -> torch.Tensor: @@ -167,11 +170,9 @@ def sample( greedy_sampled = self.greedy_sample(probs) random_sampled = self.random_sample(probs, sampling_metadata.generators) - sampled = torch.where( - sampling_metadata.temperature < _SAMPLING_EPS, - greedy_sampled, - random_sampled, - ) + temperature = sampling_metadata.logits_process_metadata.temperature + sampled = torch.where(temperature < _SAMPLING_EPS, + greedy_sampled, random_sampled) return sampled diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e6e423d0db5db..c7009c1a20a30 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -395,8 +395,6 @@ def _prepare_inputs( def _prepare_sampling( self, scheduler_output: "SchedulerOutput", - num_input_tokens: int, - query_start_loc: torch.Tensor, ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids @@ -406,12 +404,7 @@ def _prepare_sampling( or scheduler_output.scheduled_resumed_reqs): skip_copy = False # Create the sampling metadata. - sampling_metadata = self.input_batch.make_sampling_metadata( - scheduler_output.partial_req_index, - num_input_tokens, - query_start_loc, - skip_copy, - ) + sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -516,7 +509,7 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens - sampling_metadata = self._prepare_sampling( + sampling_metadata, prompt_logprobs_metadata = self._prepare_sampling( scheduler_output, num_input_tokens, attn_metadata.query_start_loc) if self.is_multimodal_model: @@ -587,22 +580,19 @@ def execute_model( # Compute prompt logprobs. prompt_hidden_states = hidden_states[prompt_logits_mask] prompt_logits = self.model.compute_logits(prompt_hidden_states, None) - # TODO: why is the sampler part of the model def? + # TODO(rob): Why is the sampler part of the model definition? 
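The reworked sample() above computes both a greedy and a random draw and then selects per row with torch.where on the temperature. A hedged, standalone sketch of that pattern (mixed_sample is an illustrative name, not the vLLM method):

import torch

_SAMPLING_EPS = 1e-5


def mixed_sample(probs: torch.Tensor, temperature: torch.Tensor) -> torch.Tensor:
    greedy = probs.argmax(dim=-1)
    random = torch.multinomial(probs, num_samples=1).squeeze(-1)
    # Rows with ~zero temperature take the greedy token, the rest the sampled one.
    return torch.where(temperature < _SAMPLING_EPS, greedy, random)


probs = torch.softmax(torch.randn(3, 10), dim=-1)
temperature = torch.tensor([0.0, 0.7, 1.0])
tokens = mixed_sample(probs, temperature)
assert tokens.shape == (3,) and tokens[0] == probs[0].argmax()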
prompt_logprobs_output = self.model.sampler.get_prompt_logprobs( prompt_logits, prompt_logprobs_metadata) - model_runner_output = ModelRunnerOutput( + return ModelRunnerOutput( req_ids=cast(List[str], self.input_batch.req_ids[:num_reqs]), req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=sampled_token_ids, - # NOTE: sample and prompt logprob CPU-GPU synchronization happens - # here - batch_logprob_token_ids_cpu=batch_logprob_token_ids_cpu, - batch_logprobs_cpu=batch_logprobs_cpu, - batch_prompt_logprob_token_ids_cpu=( - batch_prompt_logprob_token_ids_cpu), - batch_prompt_logprobs_cpu=(batch_prompt_logprobs_cpu)) - return model_runner_output + logprob_token_ids=sampler_output.logprob_token_ids, + logprobs=sampler_output.logprobs, + prompt_logprob_token_ids=prompt_logprobs_output.logprob_token_ids, + prompt_logprobs=prompt_logprobs_output.logprobs, + ) def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) From 970e0307563a1d799383a8df90596d5ce32fbeee Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 1 Jan 2025 21:01:22 -0500 Subject: [PATCH 175/293] format --- vllm/v1/outputs.py | 20 ++----- vllm/v1/sample/metadata.py | 11 +++- vllm/v1/sample/sampler.py | 9 ++- vllm/v1/worker/gpu_input_batch.py | 65 +++++++++++++--------- vllm/v1/worker/gpu_model_runner.py | 88 ++++++++++++++++++------------ 5 files changed, 111 insertions(+), 82 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 0da9f8742c1c1..3f7c555b04841 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, List +from typing import Dict, List, Tuple @dataclass @@ -9,16 +9,8 @@ class SamplerOutput: sampled_token_ids: List[int] # [num_reqs, max_num_logprobs] - logprob_token_ids: List[int] - logprobs: List[int] - - -@dataclass -class PromptLogprobsOutput: - - # req_id -> [max_num_prompt_logprobs] - logprob_token_ids: Dict[str, List[int]] - logprobs: Dict[str, List[float]] + logprob_token_ids: List[List[int]] + logprobs: List[List[int]] # ModelRunnerOutput is serialized and sent to the scheduler process. @@ -38,6 +30,6 @@ class ModelRunnerOutput: logprob_token_ids: List[List[int]] logprobs: List[List[float]] - # req_id -> [max_num_prompt_logprobs] - prompt_logprob_token_ids: Dict[str, List[int]] - prompt_logprobs: Dict[str, List[float]] + # req_id -> (prompt_logprobs_token_ids, prompt_logprobs) + # [num_reqs, max_num_prompt_logprobs] + prompt_logprobs: Dict[str, Tuple[List[List[int], List[List[float]]]]] diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 4d06cdbb4d083..484b568cf3a3e 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Dict, List +import numpy import torch @@ -17,6 +18,8 @@ class LogitsProcessMetadata: @dataclass class SamplingMetadata: + # Indicies in the batch needing + sample_indicies: torch.Tensor all_greedy: bool all_random: bool logits_process_metadata: LogitsProcessMetadata @@ -27,7 +30,13 @@ class SamplingMetadata: @dataclass class PromptLogprobsMetadata: + # Mask of the indices needed for prompt logprobs. + prompt_logprobs_mask: numpy.ndarray[bool] + + # Note: req_ids must be in order of the requests + # in prompt_indicies. 
req_ids: List[str] - req_indicies: List[int] + prompt_lens: Dict[str, int] + logits_process_metadata: LogitsProcessMetadata max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index cbff438ca1c8a..3086e0808a455 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -59,7 +59,7 @@ def get_prompt_logprobs( self, logits: torch.Tensor, prompt_logprobs_metadata: PromptLogprobsMetadata, - ) -> PromptLogprobsOutput: + ) -> Tuple[List[List[int]], List[List[int]]]: # Apply logits processor. logits = self._process_logits( logits, prompt_logprobs_metadata.logits_process_metadata) @@ -69,8 +69,7 @@ def get_prompt_logprobs( logprob_token_ids, logprobs = self._compute_logprobs( logits, prompt_logprobs_metadata.max_num_logprobs) - return PromptLogprobsOutput(logprob_token_ids=logprob_token_ids, - logprobs=logprobs) + return logprob_token_ids, logprobs def _compute_logprobs( self, logits: torch.Tensor, @@ -171,8 +170,8 @@ def sample( random_sampled = self.random_sample(probs, sampling_metadata.generators) temperature = sampling_metadata.logits_process_metadata.temperature - sampled = torch.where(temperature < _SAMPLING_EPS, - greedy_sampled, random_sampled) + sampled = torch.where(temperature < _SAMPLING_EPS, greedy_sampled, + random_sampled) return sampled diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 96dcba4881a30..5fa4a95e1fe01 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -8,7 +8,9 @@ from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType -from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.metadata import (LogitsProcessMetadata, + SamplingMetadata, + PromptLogprobsMetadata) if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange @@ -110,16 +112,17 @@ def __init__( self.top_k_cpu = self.top_k_cpu_tensor.numpy() self.top_k_reqs: Set[str] = set() + # req_index -> generator + # NOTE(woosuk): The indices of the requests that do not have their own + # generator should not be included in the dictionary. + self.generators: Dict[int, torch.Generator] = {} + # Logprobs-related. # NOTE(rob): The prompt logprobs trackers only include reqs that - # are actively generating logprobs (i.e. in prefill phase). + # are actively generating logprobs (i.e. they in prefill phase). self.num_logprobs: Dict[str, int] = {} self.num_prompt_logprobs: Dict[str, int] = {} - # req_index -> generator - # NOTE(woosuk): The indices of the requests that do not have their own - # generator should not be included in the dictionary. - self.generators: Dict[int, torch.Generator] = {} def add_request( self, @@ -166,12 +169,13 @@ def add_request( if request.generator is not None: self.generators[req_index] = request.generator - num_logprobs = sampling_params.logprobs - num_prompt_logprobs = sampling_params.prompt_logprobs - if num_logprobs and num_logprobs > 0: - self.num_logprobs[req_id] = num_logprobs - if num_prompt_logprobs and num_prompt_logprobs > 0: - self.num_prompt_logprobs[req_id] = num_prompt_logprobs + if sampling_params.logprobs: + self.num_logprobs[req_id] = sampling_params.logprobs + if sampling_params.prompt_logprobs: + # TODO(rob): handle prefix caching and recomputation. + # We need to re-run the prefill if requesting prompt + # logprobs w/ prefix caching. 
+ self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) @@ -245,6 +249,7 @@ def condense(self, empty_req_indices: List[int]) -> None: def make_sampling_metadata( self, + query_start_loc: torch.Tensor, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -255,20 +260,34 @@ def make_sampling_metadata( self.top_k[:self.num_reqs].copy_( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) - num_reqs = self.num_reqs - return SamplingMetadata( - temperature=self.temperature[:num_reqs], + sample_indicies=query_start_loc[1:] - 1, all_greedy=self.all_greedy, all_random=self.all_random, - top_p=self.top_p[:self.num_reqs], - top_k=self.top_k[:self.num_reqs], - no_top_p=self.no_top_p, - no_top_k=self.no_top_k, + logits_process_metadata=LogitsProcessMetadata( + temperature=self.temperature[:self.num_reqs], + top_p=self.top_p[:self.num_reqs], + top_k=self.top_k[:self.num_reqs], + no_top_p=self.no_top_p, + no_top_k=self.no_top_k), generators=self.generators, max_num_logprobs=self.max_num_logprobs, ) + def make_prompt_logprobs_metadata( + self) -> Optional[PromptLogprobsMetadata]: + + if self.max_num_prompt_logprobs: + + return PromptLogprobsMetadata( + req_ids=list(self.num_prompt_logprobs.keys()), + + max_num_logprobs=self.max_num_prompt_logprobs, + ) + else: + return None + + @property def num_reqs(self) -> int: return len(self.req_id_to_index) @@ -297,11 +316,3 @@ def max_num_logprobs(self) -> int: def max_num_prompt_logprobs(self) -> int: return (max(self.num_prompt_logprobs.values()) if self.num_prompt_logprobs else 0) - - @property - def no_logprob(self) -> bool: - return len(self.num_logprobs) == 0 - - @property - def no_prompt_logprob(self) -> bool: - return len(self.num_prompt_logprobs) == 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c7009c1a20a30..f9d35baf790c2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,6 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Tuple, cast +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast import numpy as np import torch @@ -21,7 +21,7 @@ FlashAttentionMetadata) from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient from vllm.v1.outputs import ModelRunnerOutput, SamplerOutput -from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.metadata import SamplingMetadata, PromptLogprobsMetadata from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -208,13 +208,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.input_batch.block_table_cpu[ req_index, start_index:end_index] = req_data.new_block_ids - # TODO(rob): is there a cleaner way to do this? - # If the request is still in the prompt phase and requires - # prompt logprobs, include it in active_prompt_logprobs. - if (req_id == scheduler_output.partial_req_id - and req_id in self.input_batch.num_prompt_logprobs): - self.input_batch.active_prompt_logprobs[req_id] = ( - self.input_batch.num_prompt_logprobs[req_id]) + # Remove from prompt logprobs once out of prefill phase. + if (req_id in self.input_batch.num_prompt_logprobs + and req_id != scheduler_output.partial_req_id): + del self.input_batch.num_prompt_logprobs[req_id] req_ids_to_add: List[str] = [] # Add new requests to the cached states. 
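The _update_states() hunk just above drops a request from num_prompt_logprobs as soon as it is no longer the partial (still-prefilling) request, so the tracker only ever holds in-prefill requests. A small sketch of that pruning rule; the helper name is hypothetical:

from typing import Dict, List, Optional


def prune_prompt_logprob_reqs(num_prompt_logprobs: Dict[str, int],
                              running_req_ids: List[str],
                              partial_req_id: Optional[str]) -> None:
    for req_id in list(num_prompt_logprobs):
        if req_id in running_req_ids and req_id != partial_req_id:
            # Prefill finished: all prompt logprobs have been produced.
            del num_prompt_logprobs[req_id]


tracker = {"a": 5, "b": 3}
prune_prompt_logprob_reqs(tracker, ["a", "b"], partial_req_id="b")
assert tracker == {"b": 3}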
@@ -270,7 +267,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> Tuple[FlashAttentionMetadata, np.ndarray[bool]]: + ) -> Tuple[FlashAttentionMetadata, + SamplingMetadata, + Optional[PromptLogprobsMetadata]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -376,25 +375,31 @@ def _prepare_inputs( slot_mapping=slot_mapping, ) - # Get mask of indices that need logits for sampling. - sample_logits_mask = query_start_loc[1:] - 1 + sampling_metadata = self._prepare_sampling(scheduler_output, query_start_loc) + prompt_logprobs_metadata = self._prepare_prompt_logprobs(req_indices) - # Get mask of indices that need logits for prompt logprobs. - # NOTE(rob): we should avoid loops over all reqs in this fn, - # but the size of num_prompt_logprobs is small since it is - # only requests that are currently in the prefill phase. - prompt_logprobs_indicies = [ + return attn_metadata, sampling_metadata, prompt_logprobs_metadata + + def _prepare_prompt_logprobs( + self, + req_indices: np.ndarray, + ) -> PromptLogprobsMetadata: + + # Indicies of requests that need prompt logprobs. + # NOTE(rob): We should avoid loops over all reqs, this just + # loops over requests that currently need prompt logprobs, + prompt_logprobs_req_idxs = [ self.input_batch.req_id_to_index[req_id] - for req_id in self.input_batch.num_prompt_logprobs - ] - prompt_logits_mask = torch.from_numpy( - np.isin(req_indices, prompt_logprobs_indicies)) + for req_id in self.input_batch.num_prompt_logprobs] - return attn_metadata, sample_logits_mask, prompt_logits_mask + prompt_logprobs_mask = np.isin( + req_indices, prompt_logprobs_req_idxs) + def _prepare_sampling( self, scheduler_output: "SchedulerOutput", + query_start_loc: torch.Tensor, ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids @@ -404,7 +409,8 @@ def _prepare_sampling( or scheduler_output.scheduled_resumed_reqs): skip_copy = False # Create the sampling metadata. - sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) + sampling_metadata = self.input_batch.make_sampling_metadata( + skip_copy, query_start_loc) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -509,9 +515,6 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens - sampling_metadata, prompt_logprobs_metadata = self._prepare_sampling( - scheduler_output, num_input_tokens, attn_metadata.query_start_loc) - if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -554,6 +557,29 @@ def execute_model( sampling_metadata=sampling_metadata, ) + # Compute prompt logprobs if requested. + prompt_logprobs_output = {} + if prompt_logprobs_metadata.max_num_logprobs > 0: + # First, compute the prompt logprobs all together. + prompt_hidden_states = hidden_states[prompt_logits_mask] + prompt_logits = self.model.compute_logits(prompt_hidden_states, + None) + prompt_logprobs_token_ids, prompt_logprobs = self.model.sampler.get_prompt_logprobs( + prompt_logits, prompt_logprobs_metadata) + + # Second, split the prompt logprobs + # NOTE(rob): We should avoid looping over all reqs for performance, + # this only loops over current prefills which need prompt lps. + # NOTE(rob): Here, we assume that req_ids are in order. 
+ start_pos = 0 + for req_id in zip(self.prompt_logprobs_metadata.req_ids): + end_pos = start_pos + prompt_logprobs_metadata.prompt_lens[ + req_id] + prompt_logprobs_output[req_id] = ( + prompt_logprobs_token_ids[start_pos:end_pos], + prompt_logprobs[start_pos:end_pos]) + start_pos = end_pos + sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. @@ -577,21 +603,13 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - # Compute prompt logprobs. - prompt_hidden_states = hidden_states[prompt_logits_mask] - prompt_logits = self.model.compute_logits(prompt_hidden_states, None) - # TODO(rob): Why is the sampler part of the model definition? - prompt_logprobs_output = self.model.sampler.get_prompt_logprobs( - prompt_logits, prompt_logprobs_metadata) - return ModelRunnerOutput( req_ids=cast(List[str], self.input_batch.req_ids[:num_reqs]), req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=sampled_token_ids, logprob_token_ids=sampler_output.logprob_token_ids, logprobs=sampler_output.logprobs, - prompt_logprob_token_ids=prompt_logprobs_output.logprob_token_ids, - prompt_logprobs=prompt_logprobs_output.logprobs, + prompt_logprobs=prompt_logprobs_output, ) def load_model(self) -> None: From 54d6f175c8ec1af75186412308141d598e68b276 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Wed, 1 Jan 2025 22:04:30 -0500 Subject: [PATCH 176/293] single compute_logits, consider switching to N compute_logits --- vllm/v1/sample/metadata.py | 12 ++--- vllm/v1/sample/sampler.py | 24 ++------- vllm/v1/worker/gpu_input_batch.py | 47 +++++++++++++---- vllm/v1/worker/gpu_model_runner.py | 82 ++++++++++++++++-------------- 4 files changed, 89 insertions(+), 76 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 484b568cf3a3e..238d471b30291 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -18,7 +18,6 @@ class LogitsProcessMetadata: @dataclass class SamplingMetadata: - # Indicies in the batch needing sample_indicies: torch.Tensor all_greedy: bool all_random: bool @@ -30,13 +29,8 @@ class SamplingMetadata: @dataclass class PromptLogprobsMetadata: - # Mask of the indices needed for prompt logprobs. - prompt_logprobs_mask: numpy.ndarray[bool] - - # Note: req_ids must be in order of the requests - # in prompt_indicies. - req_ids: List[str] - prompt_lens: Dict[str, int] + # req_id -> mask of indices each prompt logprob + logits_masks: Dict[str, numpy.ndarray[bool]] + # Logits process metadata for all elts of the batch logits_process_metadata: LogitsProcessMetadata - max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 3086e0808a455..8b1e7adf3cf5f 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -35,7 +35,7 @@ def forward( """ # Sample next token. - logits = self._process_logits( + logits = self.process_logits( logits, sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) @@ -44,7 +44,7 @@ def forward( # Compute the logprobs if requested. # NOTE: logprob CPU-GPU synchronization happens here. - logprob_token_ids, logprobs = self._compute_logprobs( + logprob_token_ids, logprobs = self.compute_logprobs( logits, sampling_metadata.max_num_logprobs) # NOTE: CPU-GPU synchronization happens here. 
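The sampler's forward path shown above (process the logits, take probabilities, sample, then compute logprobs) mixes greedy and random sampling per request with `torch.where(temperature < _SAMPLING_EPS, ...)`. A toy sketch of just that selection step, assuming temperature scaling has already been applied to the logits (all values below are made up):

import torch

_SAMPLING_EPS = 1e-5

# Per-request temperatures; 0.0 means the request wants greedy decoding.
temperature = torch.tensor([0.0, 0.8, 1.0])
probs = torch.softmax(torch.randn(3, 16), dim=-1)

# Both branches are computed for the whole batch; torch.where then keeps the
# greedy token wherever temperature is (near) zero and the random sample
# everywhere else.
greedy_sampled = probs.argmax(dim=-1)
random_sampled = torch.multinomial(probs, num_samples=1).squeeze(-1)
sampled = torch.where(temperature < _SAMPLING_EPS, greedy_sampled,
                      random_sampled)
assert sampled.shape == (3,)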
@@ -55,23 +55,7 @@ def forward( ) return sampler_output - def get_prompt_logprobs( - self, - logits: torch.Tensor, - prompt_logprobs_metadata: PromptLogprobsMetadata, - ) -> Tuple[List[List[int]], List[List[int]]]: - # Apply logits processor. - logits = self._process_logits( - logits, prompt_logprobs_metadata.logits_process_metadata) - - # Compute the prompt logprobs if requested. - # NOTE: CPU-GPU synchronization happens here. - logprob_token_ids, logprobs = self._compute_logprobs( - logits, prompt_logprobs_metadata.max_num_logprobs) - - return logprob_token_ids, logprobs - - def _compute_logprobs( + def compute_logprobs( self, logits: torch.Tensor, max_num_logprobs: int) -> Tuple[List[int], List[float]]: if max_num_logprobs > 0: @@ -89,7 +73,7 @@ def _compute_logprobs( else: return [], [] - def _process_logits( + def process_logits( self, logits: torch.Tensor, logits_process_metadata: LogitsProcessMetadata, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 5fa4a95e1fe01..d458e8ca9d874 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -275,18 +275,47 @@ def make_sampling_metadata( ) def make_prompt_logprobs_metadata( - self) -> Optional[PromptLogprobsMetadata]: - - if self.max_num_prompt_logprobs: - - return PromptLogprobsMetadata( - req_ids=list(self.num_prompt_logprobs.keys()), + self, + num_scheduled_tokens: np.ndarray, + req_indices: np.ndarray, + ) -> Optional[PromptLogprobsMetadata]: - max_num_logprobs=self.max_num_prompt_logprobs, - ) - else: + if not self.max_num_prompt_logprobs: return None + # Create masks for each request needing prompt logprobs. + # TODO(rob): wrap this in torch tensor + move to GPU? + logits_masks = { + req_id: (req_indices == self.req_id_to_index[req_id]) + for req_id in self.num_prompt_logprobs + } + + # Expand temp, top_p, and top_k for the whole batch. + # NOTE(rob): to simplify implementation, process the logits + # for all batch elements when computing prompt logprobs. + # TODO(rob): I think these will come out flattened, need to + # reshape after calling repeat_interleave? + num_scheduled_tokens_torch = torch.from_numpy( + num_scheduled_tokens).to(self.temperature.device) + temperature = torch.repeat_interleave(self.temperature, + num_scheduled_tokens_torch) + # Skip expansion if we are going to skip top_p/k anyways. 
+ top_p = (self.top_p if self.no_top_p else + torch.repeat_interleave(self.top_p, + num_scheduled_tokens_torch)) + top_k = (self.top_k if self.no_top_k else + torch.repeat_interleave(self.top_k, + num_scheduled_tokens_torch)) + + return PromptLogprobsMetadata( + logits_masks=logits_masks, + logits_process_metadata=LogitsProcessMetadata( + temperature=temperature, + top_p=top_p, + top_k=top_k, + no_top_p=self.no_top_p, + no_top_k=self.no_top_k, + )) @property def num_reqs(self) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f9d35baf790c2..7dffdeab0fb4c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -375,27 +375,13 @@ def _prepare_inputs( slot_mapping=slot_mapping, ) - sampling_metadata = self._prepare_sampling(scheduler_output, query_start_loc) - prompt_logprobs_metadata = self._prepare_prompt_logprobs(req_indices) + sampling_metadata = self._prepare_sampling( + scheduler_output, query_start_loc) + prompt_logprobs_metadata = self._prepare_prompt_logprobs( + num_scheduled_tokens, req_indices) return attn_metadata, sampling_metadata, prompt_logprobs_metadata - def _prepare_prompt_logprobs( - self, - req_indices: np.ndarray, - ) -> PromptLogprobsMetadata: - - # Indicies of requests that need prompt logprobs. - # NOTE(rob): We should avoid loops over all reqs, this just - # loops over requests that currently need prompt logprobs, - prompt_logprobs_req_idxs = [ - self.input_batch.req_id_to_index[req_id] - for req_id in self.input_batch.num_prompt_logprobs] - - prompt_logprobs_mask = np.isin( - req_indices, prompt_logprobs_req_idxs) - - def _prepare_sampling( self, scheduler_output: "SchedulerOutput", @@ -413,6 +399,19 @@ def _prepare_sampling( skip_copy, query_start_loc) return sampling_metadata + def _prepare_prompt_logprobs( + self, + num_scheduled_tokens: np.array, + req_indices: np.ndarray, + ) -> Optional[PromptLogprobsMetadata]: + # NOTE(rob): Since this function uses the values of + # input_batch.temp/top_p/top_k, which are mutated in + # self._prepare_sampling, it should be called AFTER. + + # Create the prompt logprobs metadata. + return self.input_batch.make_prompt_logprobs_metadata( + num_scheduled_tokens, req_indices) + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: @@ -501,7 +500,7 @@ def execute_model( encoder_outputs = [] # Prepare the decoder inputs. - attn_metadata, sample_logits_mask, prompt_logits_mask = ( + attn_metadata, sampling_metadata, prompt_logprobs_metadata = ( self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph @@ -548,7 +547,7 @@ def execute_model( inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:num_scheduled_tokens] - sample_hidden_states = hidden_states[sample_logits_mask] + sample_hidden_states = hidden_states[sampling_metadata.sample_indicies] sample_logits = self.model.compute_logits(sample_hidden_states, None) # Sample the next token. @@ -559,26 +558,33 @@ def execute_model( # Compute prompt logprobs if requested. prompt_logprobs_output = {} - if prompt_logprobs_metadata.max_num_logprobs > 0: - # First, compute the prompt logprobs all together. 
- prompt_hidden_states = hidden_states[prompt_logits_mask] - prompt_logits = self.model.compute_logits(prompt_hidden_states, - None) - prompt_logprobs_token_ids, prompt_logprobs = self.model.sampler.get_prompt_logprobs( - prompt_logits, prompt_logprobs_metadata) + if prompt_logprobs_metadata: + # NOTE(rob): the implementation computes logits that are not + # needed and uses a small loop to keep code simple for a low + # importance feature (primarily used for lm-eval evaluations). + + # First, compute the logits for all elements of the batch + logits = self.model.sampler.compute_logits(hidden_states, None) + logits = self.model.sampler.process_logits( + logits, prompt_logprobs_metadata.logits_process_metadata) + + # Second, compute the logprobs for requests needing prompt lps. + # NOTE(rob): We should avoid looping over all reqs, this loop + # this only loops over active prefills which need prompt lps. + prompt_logprobs = {} + for req_id, logits_mask in logits_mask.items(): + req_logits = logits[logits_mask] + lp_token_ids, lps = self.model.sampler.compute_logprobs( + req_logits, self.input_batch.num_prompt_logprobs[req_id]) + + + # TODO: remove the sample logprob by checking if this is a + # partial request or not. + prompt_logprobs[req_id] = (lp_token_ids, lps) + # Second, split the prompt logprobs - # NOTE(rob): We should avoid looping over all reqs for performance, - # this only loops over current prefills which need prompt lps. - # NOTE(rob): Here, we assume that req_ids are in order. - start_pos = 0 - for req_id in zip(self.prompt_logprobs_metadata.req_ids): - end_pos = start_pos + prompt_logprobs_metadata.prompt_lens[ - req_id] - prompt_logprobs_output[req_id] = ( - prompt_logprobs_token_ids[start_pos:end_pos], - prompt_logprobs[start_pos:end_pos]) - start_pos = end_pos + sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From 8d4723e566dd3be6f54e4dc29e0d3b2118ebd943 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 2 Jan 2025 07:41:59 -0500 Subject: [PATCH 177/293] format --- vllm/v1/sample/sampler.py | 4 ++-- vllm/v1/worker/gpu_input_batch.py | 30 ++++++++++++++---------------- vllm/v1/worker/gpu_model_runner.py | 18 +++++++----------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 8b1e7adf3cf5f..76e662d8252e8 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -35,8 +35,8 @@ def forward( """ # Sample next token. - logits = self.process_logits( - logits, sampling_metadata.logits_process_metadata) + logits = self.process_logits(logits, + sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. 
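The `torch.repeat_interleave` calls introduced in `make_prompt_logprobs_metadata` above expand the per-request temperature/top_p/top_k values to per-token values, so every prompt position is processed with its own request's parameters. The expansion step in isolation looks roughly like this (toy values, not the patch's exact code):

import torch

# Per-request sampling parameters for a batch of three requests.
temperature = torch.tensor([1.0, 0.7, 0.2])

# Number of tokens scheduled for each request in this step.
num_scheduled_tokens = torch.tensor([4, 3, 1])

# One value per flattened token position, aligned with the batch logits:
# request 0's temperature repeated 4 times, request 1's 3 times, and so on.
per_token_temperature = torch.repeat_interleave(temperature,
                                                num_scheduled_tokens)
assert per_token_temperature.shape == (8,)

Skipping the expansion when top_p/top_k are unused, as the diff does, avoids materializing per-token tensors that the logits processor would ignore anyway.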
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index d458e8ca9d874..05464f960e122 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -8,8 +8,7 @@ from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType -from vllm.v1.sample.metadata import (LogitsProcessMetadata, - SamplingMetadata, +from vllm.v1.sample.metadata import (LogitsProcessMetadata, SamplingMetadata, PromptLogprobsMetadata) if TYPE_CHECKING: @@ -123,7 +122,6 @@ def __init__( self.num_logprobs: Dict[str, int] = {} self.num_prompt_logprobs: Dict[str, int] = {} - def add_request( self, request: "CachedRequestState", @@ -285,27 +283,27 @@ def make_prompt_logprobs_metadata( # Create masks for each request needing prompt logprobs. # TODO(rob): wrap this in torch tensor + move to GPU? + metas = {} + for req_id in self.num_prompt_logprobs: + req_index = self.req_id_to_index[req_id] + num_scheduled_tok = num_scheduled_tokens[req_index] + top_p = self.top_p + logits_masks = { req_id: (req_indices == self.req_id_to_index[req_id]) for req_id in self.num_prompt_logprobs } - # Expand temp, top_p, and top_k for the whole batch. - # NOTE(rob): to simplify implementation, process the logits - # for all batch elements when computing prompt logprobs. - # TODO(rob): I think these will come out flattened, need to - # reshape after calling repeat_interleave? - num_scheduled_tokens_torch = torch.from_numpy( - num_scheduled_tokens).to(self.temperature.device) + # Expand temp, top_p, and top_k for the whole batch + num_scheduled_tokens_torch = torch.from_numpy(num_scheduled_tokens).to( + self.temperature.device) temperature = torch.repeat_interleave(self.temperature, num_scheduled_tokens_torch) # Skip expansion if we are going to skip top_p/k anyways. 
- top_p = (self.top_p if self.no_top_p else - torch.repeat_interleave(self.top_p, - num_scheduled_tokens_torch)) - top_k = (self.top_k if self.no_top_k else - torch.repeat_interleave(self.top_k, - num_scheduled_tokens_torch)) + top_p = (self.top_p if self.no_top_p else torch.repeat_interleave( + self.top_p, num_scheduled_tokens_torch)) + top_k = (self.top_k if self.no_top_k else torch.repeat_interleave( + self.top_k, num_scheduled_tokens_torch)) return PromptLogprobsMetadata( logits_masks=logits_masks, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7dffdeab0fb4c..787ccea82d2cc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -267,8 +267,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> Tuple[FlashAttentionMetadata, - SamplingMetadata, + ) -> Tuple[FlashAttentionMetadata, SamplingMetadata, Optional[PromptLogprobsMetadata]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -375,8 +374,8 @@ def _prepare_inputs( slot_mapping=slot_mapping, ) - sampling_metadata = self._prepare_sampling( - scheduler_output, query_start_loc) + sampling_metadata = self._prepare_sampling(scheduler_output, + query_start_loc) prompt_logprobs_metadata = self._prepare_prompt_logprobs( num_scheduled_tokens, req_indices) @@ -405,9 +404,9 @@ def _prepare_prompt_logprobs( req_indices: np.ndarray, ) -> Optional[PromptLogprobsMetadata]: # NOTE(rob): Since this function uses the values of - # input_batch.temp/top_p/top_k, which are mutated in + # input_batch.temp/top_p/top_k, which are mutated in # self._prepare_sampling, it should be called AFTER. - + # Create the prompt logprobs metadata. return self.input_batch.make_prompt_logprobs_metadata( num_scheduled_tokens, req_indices) @@ -567,7 +566,7 @@ def execute_model( logits = self.model.sampler.compute_logits(hidden_states, None) logits = self.model.sampler.process_logits( logits, prompt_logprobs_metadata.logits_process_metadata) - + # Second, compute the logprobs for requests needing prompt lps. # NOTE(rob): We should avoid looping over all reqs, this loop # this only loops over active prefills which need prompt lps. @@ -576,15 +575,12 @@ def execute_model( req_logits = logits[logits_mask] lp_token_ids, lps = self.model.sampler.compute_logprobs( req_logits, self.input_batch.num_prompt_logprobs[req_id]) - - + # TODO: remove the sample logprob by checking if this is a # partial request or not. prompt_logprobs[req_id] = (lp_token_ids, lps) - # Second, split the prompt logprobs - sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From fc20e5edcbd9bcd3dd44d81a2040ccbd3133c7f0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 12:50:07 +0000 Subject: [PATCH 178/293] revert update from output changes --- vllm/v1/core/scheduler.py | 137 +------------------------------------- 1 file changed, 3 insertions(+), 134 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f202f8a87ef86..32873cae6f067 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -392,105 +392,14 @@ def update_from_output( scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", ) -> List[EngineCoreOutput]: - """Build engine core output from model runner output. - - Args: - scheduler_output: scheduler output prior to engine step. 
- model_runner_output: model runner output from engine step. - - Returns: - Engine core output which tracks the progress of generation. - """ # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids num_scheduled_tokens = scheduler_output.num_scheduled_tokens - do_batch_sample_logprobs = (model_runner_output.batch_logprobs_cpu - is not None) - do_batch_prompt_logprobs = ( - model_runner_output.batch_prompt_logprobs_cpu is not None - and len(model_runner_output.batch_prompt_logprobs_cpu) > 0) - - if do_batch_prompt_logprobs: - # Index into prompt tokens, for building - # prompt logprobs output data structure - mr_output_slice_lower_index = 0 new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id - prev_num_computed_tokens = request.num_computed_tokens request.num_computed_tokens += num_scheduled_tokens[req_id] - req_index = model_runner_output.req_id_to_index[req_id] - num_new_tokens = 1 - request_sample_logprobs = request.request_sample_logprobs - request_do_logprobs = (do_batch_sample_logprobs - and request_sample_logprobs is not None - and request_sample_logprobs > 0) - - if do_batch_prompt_logprobs: - request_prompt_logprobs = request.request_prompt_logprobs - # Number of new prompt tokens is the number of scheduled - # tokens *if* the request is partial (because the sampled - # token is discarded and all sequence offsets are prompt - # offsets), otherwise it is the number of scheduled - # tokens minus one (for the sampled token) - req_is_not_partial = (scheduler_output.partial_req_id != - req_id) - num_new_prompt_tokens = ( - num_scheduled_tokens[request.request_id] - - int(req_is_not_partial)) - - request_do_prompt_logprobs = (request_prompt_logprobs - is not None - and request_prompt_logprobs > 0 - and num_new_prompt_tokens > 0) - - if request_do_prompt_logprobs: - # Construct prompt logprobs, under the condition that - # prompt logprobs were requested & a nonzero number of - # prompt tokens were computed in this step for this request. - # - # Pythonization is deferred to outside the engine core. - # - # Note that this scenario returns an EngineCoreOutput which - # is empty except for the prompt logprobs which were - # computed for these prompt tokens. 
- # - # Note: new_prompt_logprobs will be used later to build the - # engine core output - assert (model_runner_output.batch_prompt_logprobs_cpu - is not None) - assert ( - model_runner_output.batch_prompt_logprob_token_ids_cpu - is not None) - assert request.prompt_logprobs is not None - assert request.prompt_logprob_token_ids is not None - logprob_cnt = request_prompt_logprobs - mr_output_slice_upper_index = ( - mr_output_slice_lower_index + num_new_prompt_tokens) - new_prompt_logprobs = ( - model_runner_output.batch_prompt_logprobs_cpu[ - mr_output_slice_lower_index: - mr_output_slice_upper_index, 0:logprob_cnt]) - new_prompt_logprob_token_ids = ( - model_runner_output.batch_prompt_logprob_token_ids_cpu[ - mr_output_slice_lower_index: - mr_output_slice_upper_index, 0:logprob_cnt]) - - req_slice_upper_index = (prev_num_computed_tokens + - num_new_prompt_tokens) - request.prompt_logprobs[ - prev_num_computed_tokens: - req_slice_upper_index] = new_prompt_logprobs - request.prompt_logprob_token_ids[ - prev_num_computed_tokens: - req_slice_upper_index] = new_prompt_logprob_token_ids - mr_output_slice_lower_index = mr_output_slice_upper_index - else: - mr_output_slice_lower_index += num_new_prompt_tokens - else: - request_do_prompt_logprobs = False - # When the request's num_computed_tokens catches up its num_tokens, # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. @@ -507,71 +416,31 @@ def update_from_output( self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: + req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request # generates at most one token at each step. token_id = sampled_token_ids[req_index] - if request_do_logprobs: - assert model_runner_output.batch_logprobs_cpu is not None - assert (model_runner_output.batch_logprob_token_ids_cpu - is not None) - assert request.logprobs is not None - # Slice out this request's sample logprobs; defer - # pythonization to be carried out in the frontend. - request.logprobs.append( - (model_runner_output.batch_logprobs_cpu[req_index], - model_runner_output. - batch_logprob_token_ids_cpu[req_index])) request.append_output_token_ids(token_id) + num_new_tokens = 1 # TODO: Update the KV cache manager for prefix caching. # Check for stop and update request state. # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) - # Compute engine core output logprobs list as such, - # so the type checker can see the assert - if request_do_logprobs: - assert request.logprobs is not None - logprobs = request.logprobs[-num_new_tokens:] - else: - logprobs = None - # Add EngineCoreOutput for this Request. - # Return the logprob for the most recently computed tokens. - # Return no prompt logprobs in decode-phase. output = EngineCoreOutput( request_id=req_id, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=logprobs, - prompt_logprobs=(new_prompt_logprobs - if request_do_prompt_logprobs else None), - prompt_logprobs_token_ids=(new_prompt_logprob_token_ids - if request_do_prompt_logprobs - else None)) + stop_reason=request.stop_reason) engine_core_outputs.append(output) # Breakout of the loop. 
if stopped: continue - elif request_do_prompt_logprobs: - # This request is still partial but prompt logprobs were - # requested - engine_core_outputs.append( - EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason, - logprobs=[] if request_do_logprobs else None, - prompt_logprobs=new_prompt_logprobs, - prompt_logprobs_token_ids=new_prompt_logprob_token_ids) - ) - new_running.append(request) self.running = new_running return engine_core_outputs From fca2daef2f748e095d1e3c4480866fc51cdd75f0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 12:52:23 +0000 Subject: [PATCH 179/293] update partial reqs to be a list --- vllm/v1/core/scheduler.py | 18 +++++++++--------- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 32873cae6f067..766342e468d14 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -109,11 +109,11 @@ def schedule(self) -> "SchedulerOutput": # but not all. The constraint is due to the persistent batch in the # V1 model runner. # TODO(woosuk): Remove this constraint after refactoring model runner. - partial_req_id = None + partial_req_ids: List[str] = [] req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". - assert partial_req_id is None + assert len(partial_req_id) == 0 assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens @@ -162,8 +162,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens if (request.num_computed_tokens + num_new_tokens < request.num_tokens): - assert partial_req_id is None - partial_req_id = request.request_id + assert len(partial_req_ids) == 0 + partial_req_ids.append(request.request_id) req_index += 1 # Encoder-related. @@ -178,7 +178,7 @@ def schedule(self) -> "SchedulerOutput": # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting: - if partial_req_id: + if len(partial_req_ids) > 0: break if len(self.running) == self.max_num_running_reqs: break @@ -245,8 +245,8 @@ def schedule(self) -> "SchedulerOutput": request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens if (num_computed_tokens + num_new_tokens < request.num_tokens): - assert partial_req_id is None - partial_req_id = request.request_id + assert len(partial_req_ids) == 0 + partial_req_ids.append(request.request_id) # Encoder-related. 
if encoder_inputs_to_schedule: @@ -287,7 +287,7 @@ def schedule(self) -> "SchedulerOutput": scheduled_new_reqs=new_reqs_data, scheduled_resumed_reqs=resumed_reqs_data, scheduled_running_reqs=running_reqs_data, - partial_req_id=partial_req_id, + partial_req_ids=partial_req_ids, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, @@ -594,7 +594,7 @@ class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] scheduled_resumed_reqs: List[ResumedRequestData] scheduled_running_reqs: List[RunningRequestData] - partial_req_id: Optional[str] + partial_req_ids: List[str] num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 787ccea82d2cc..1155414b9ed76 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -209,8 +209,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_index, start_index:end_index] = req_data.new_block_ids # Remove from prompt logprobs once out of prefill phase. - if (req_id in self.input_batch.num_prompt_logprobs - and req_id != scheduler_output.partial_req_id): + if (req_id in self.input_batch.num_prompt_logprobs and + req_id not in scheduler_output.partial_req_ids): del self.input_batch.num_prompt_logprobs[req_id] req_ids_to_add: List[str] = [] From 317ee1ed6dda414d6d1c9aa6d0bc249527fe546f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:00:38 +0000 Subject: [PATCH 180/293] update --- vllm/v1/engine/__init__.py | 7 ++-- vllm/v1/engine/processor.py | 53 ++++++++++++------------------ vllm/v1/sample/sampler.py | 21 ++---------- vllm/v1/worker/gpu_model_runner.py | 4 +-- 4 files changed, 27 insertions(+), 58 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index e4ece580e9866..6f05bb2634408 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -22,11 +22,8 @@ class DetokenizerRequest: stop: List[str] include_stop_str_in_output: bool - - # Per-request logprobs & prompt logprobs - # counts; None is equivalent to 0 - logprobs: Optional[int] - prompt_logprobs: Optional[int] + logprobs: int + prompt_logprobs: int @dataclass diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8990213b5a246..6257611531da1 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -50,11 +50,10 @@ def __init__( cache_config.enable_prefix_caching self.mm_hasher = MMHasher() - def _assert_valid_sample_logprobs_prompt_logprobs( + def _validate_logprobs( self, params: Union[SamplingParams, PoolingParams], - max_logprobs: int, - ): + ) -> None: """Validate requested number of sample logprobs & prompt logprobs Fails with ValueError if to many logprobs are requested. @@ -64,13 +63,22 @@ def _assert_valid_sample_logprobs_prompt_logprobs( max_logprobs: max number of logprobs or prompt logprobs """ - if isinstance(params, SamplingParams) and ( - (params.logprobs and params.logprobs > max_logprobs) or - (params.prompt_logprobs - and params.prompt_logprobs > max_logprobs)): + if not isinstance(params, SamplingParams): + return + + max_logprobs = self.model_config.max_logprobs + + # Validate sample logprobs. 
+ if (params.logprobs and params.logprobs > max_logprobs): + raise ValueError( + f"Requested sample logprobs of {params.logprobs}, " + f"which is greated than max allowed: {max_logprobs}") - raise ValueError(f"Cannot request more than " - f"{max_logprobs} logprobs or prompt logprobs.") + # Validate prompt logprobs. + if (params.prompt_logprobs and params.prompt_logprobs > max_logprobs): + raise ValueError( + f"Requested prompt logprobs of {params.prompt_logprobs}, " + f"which is greated than max allowed: {max_logprobs}") # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the @@ -86,31 +94,12 @@ def process_inputs( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: - """Process the input prompt into engine (& possibly tokenizer) requests - - Args: - request_id: request ID - prompt: input prompt str - params: sampling or pooling commands - arrival_time: time when inputs arrived; will be computed if `None` - is passed in - max_logprobs_permitted_by_engine: the max number of sample or prompt - logprobs a request may ask for - lora_request: LoRA request structure - trace_headers: trace info - prompt_adapter_request: prompt adapter request structure - priority: currently unsupported; must be zero & is by default. - - Returns: - Detokenizer request structure - Engine request structure - """ # TODO(woosuk): Support pooling models. # TODO(woosuk): Support encoder-decoder models. - self._assert_valid_sample_logprobs_prompt_logprobs( - params, self.model_config.max_logprobs) + # TODO(rob): Add more param validation here. + self._validate_logprobs(params) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -177,8 +166,8 @@ def process_inputs( sampling_params.output_kind, sampling_params.stop, sampling_params.include_stop_str_in_output, - sampling_params.logprobs, - sampling_params.prompt_logprobs, + sampling_params.logprobs or 0, + sampling_params.prompt_logprobs or 0, ) # Make Request for EngineCore. diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 76e662d8252e8..51eb80167173d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -18,25 +18,8 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - """Implement sampling. - - Apply temperature, top-k and top-p. - Sample from the probability distribution implied by `logits`. - Only sample at sequence offsets where new tokens are decoded. - In the process, compute sample and prompt logprobs (if required.) - - Args: - logits: model output logits which imply probability distribution. - sampling_metadata: sampling config settings - - Returns: - Sampler output. Sampled tokens and sample/prompt logprobs - (if requested) - """ - - # Sample next token. - logits = self.process_logits(logits, - sampling_metadata.logits_process_metadata) + logits = self.process_logits( + logits, sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. 
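After sampling, `forward` goes on to call `compute_logprobs`, which returns the top-k log-probabilities per position; a later commit in this series also appends the sampled token's own logprob, giving `max_num_logprobs + 1` entries per row. One hedged way to express that combination (an illustration only; the in-tree code differs in details such as the int32 conversion and the move to CPU):

import torch

def topk_logprobs_with_sampled(logits: torch.Tensor,
                               sampled_token_ids: torch.Tensor,
                               k: int):
    """logits: [num_tokens, vocab], sampled_token_ids: [num_tokens]."""
    logprobs = torch.log_softmax(logits, dim=-1)
    topk_vals, topk_ids = torch.topk(logprobs, k, dim=-1)
    # Gather the logprob of the token that was actually sampled and append
    # it as one extra column, giving k + 1 entries per position.
    sampled_lps = logprobs.gather(-1, sampled_token_ids.unsqueeze(-1).long())
    token_ids = torch.cat([topk_ids, sampled_token_ids.unsqueeze(-1)], dim=-1)
    values = torch.cat([topk_vals, sampled_lps], dim=-1)
    return token_ids, values

ids, vals = topk_logprobs_with_sampled(torch.randn(2, 10),
                                       torch.tensor([3, 7]), k=5)
assert ids.shape == (2, 6) and vals.shape == (2, 6)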
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1155414b9ed76..006da8a568f3f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -209,8 +209,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_index, start_index:end_index] = req_data.new_block_ids # Remove from prompt logprobs once out of prefill phase. - if (req_id in self.input_batch.num_prompt_logprobs and - req_id not in scheduler_output.partial_req_ids): + if (req_id in self.input_batch.num_prompt_logprobs + and req_id not in scheduler_output.partial_req_ids): del self.input_batch.num_prompt_logprobs[req_id] req_ids_to_add: List[str] = [] From 74fc26456b2b4c8a888f78302430dc2140a0b311 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:07:51 +0000 Subject: [PATCH 181/293] updated --- vllm/v1/worker/gpu_input_batch.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 05464f960e122..e5c0caad546d6 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -247,7 +247,7 @@ def condense(self, empty_req_indices: List[int]) -> None: def make_sampling_metadata( self, - query_start_loc: torch.Tensor, + sample_indices: torch.Tensor, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -259,7 +259,7 @@ def make_sampling_metadata( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) return SamplingMetadata( - sample_indicies=query_start_loc[1:] - 1, + sample_indicies=sample_indices, all_greedy=self.all_greedy, all_random=self.all_random, logits_process_metadata=LogitsProcessMetadata( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 006da8a568f3f..507caffeec71f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -374,8 +374,11 @@ def _prepare_inputs( slot_mapping=slot_mapping, ) - sampling_metadata = self._prepare_sampling(scheduler_output, - query_start_loc) + # Make Sampling Metadata + sampling_metadata = self._prepare_sampling( + scheduler_output=scheduler_output, + sample_indices=query_start_loc[1:] - 1 + ) prompt_logprobs_metadata = self._prepare_prompt_logprobs( num_scheduled_tokens, req_indices) @@ -384,7 +387,7 @@ def _prepare_inputs( def _prepare_sampling( self, scheduler_output: "SchedulerOutput", - query_start_loc: torch.Tensor, + sample_indices: torch.Tensor, ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids @@ -395,7 +398,7 @@ def _prepare_sampling( skip_copy = False # Create the sampling metadata. sampling_metadata = self.input_batch.make_sampling_metadata( - skip_copy, query_start_loc) + skip_copy, sample_indices) return sampling_metadata def _prepare_prompt_logprobs( @@ -582,6 +585,7 @@ def execute_model( # Second, split the prompt logprobs + # Update Request State. sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. 
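Tying the pieces together, the prompt-logprob path in `execute_model` above computes logits once for every scheduled token and then slices them per request with the boolean `logits_masks`, taking the top-k only for requests that asked for prompt logprobs. A condensed, self-contained sketch of that loop with toy sizes (the request ids and dimensions are invented; the real code also reuses the sampler's `compute_logprobs` and handles the partial-request case):

import numpy as np
import torch

req_indices = np.array([0, 0, 0, 1, 1])      # flattened token -> request index
req_id_to_index = {"req-a": 0, "req-b": 1}
num_prompt_logprobs = {"req-a": 3}           # only req-a wants prompt logprobs

logits = torch.randn(len(req_indices), 16)   # [num_tokens, vocab_size]

prompt_logprobs = {}
for req_id, k in num_prompt_logprobs.items():
    # Boolean mask selecting only this request's token positions.
    mask = torch.from_numpy(req_indices == req_id_to_index[req_id])
    req_logprobs = torch.log_softmax(logits[mask], dim=-1)
    top_vals, top_ids = torch.topk(req_logprobs, k, dim=-1)
    prompt_logprobs[req_id] = (top_ids.to(torch.int32), top_vals)

assert prompt_logprobs["req-a"][0].shape == (3, 3)  # 3 prompt positions, top-3

Computing logits for positions that are never read back keeps the code simple, which matches the NOTE in the diff that treats prompt logprobs as a low-importance path used mainly for evaluations.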
@@ -605,14 +609,21 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - return ModelRunnerOutput( - req_ids=cast(List[str], self.input_batch.req_ids[:num_reqs]), + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=sampled_token_ids, logprob_token_ids=sampler_output.logprob_token_ids, logprobs=sampler_output.logprobs, prompt_logprobs=prompt_logprobs_output, ) + return model_runner_output def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) From db999da50ecde7732292c29911dfae7b1e3d722a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:08:46 +0000 Subject: [PATCH 182/293] remove unrelated changes --- vllm/v1/engine/detokenizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1b3c8e671909c..cef08c7946380 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -19,11 +19,6 @@ @dataclass class IncrementalDetokenizer: - """Track and implement detokenization for a single request. - - Also handles Pythonization (conversion to OpenAI-API-compatible Python - data structures) of logprobs Numpy arrays computed for the request. - """ # Generation data output_text: str From 9b430d885daf23dd55a5ac575d538d4f981baf5d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:11:58 +0000 Subject: [PATCH 183/293] updated --- vllm/v1/engine/detokenizer.py | 15 +++------------ vllm/v1/engine/llm_engine.py | 12 ++++-------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index cef08c7946380..a539f21d889f5 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -24,9 +24,9 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] - request_logprobs: Optional[SampleLogprobs] - request_prompt_logprobs: Optional[PromptLogprobs] - request_cumulative_logprob: Optional[float] + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + cumulative_logprob: Optional[float] # Stop strings stop: List[str] @@ -71,15 +71,6 @@ def from_new_request( tokenizer: AnyTokenizer, request: DetokenizerRequest, ) -> "IncrementalDetokenizer": - """Construct incremental detokenizer for a request. - - Args: - tokenizer: tokenizer provides detokenization methods - request: track detokenization progress of this request - - Returns: - Incremental detokenizer for the request - """ tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( tokenizer=tokenizer, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index d82cf388c5db1..bea8c5502f612 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -150,14 +150,8 @@ def add_request( # 1) Process raw inputs into the request. 
detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id=request_id, - prompt=prompt, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority) + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) # 2) Add the request to Detokenizer. self.detokenizer.add_request(detokenizer_req) @@ -180,6 +174,8 @@ def step(self) -> List[RequestOutput]: return request_outputs + # TODO(rob): Can we get rid of these? + def get_model_config(self): return self.model_config From d470e23169e44ef8d3918e223b86b38c01da7152 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:12:35 +0000 Subject: [PATCH 184/293] nit --- vllm/v1/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 766342e468d14..80b53c3025b39 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -113,7 +113,7 @@ def schedule(self) -> "SchedulerOutput": req_index = 0 while req_index < len(self.running): # Only the last request in the RUNNING queue can be "partial". - assert len(partial_req_id) == 0 + assert len(partial_req_ids) == 0 assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens From ecaa68a38da23e97f8e61b1555b235df0a9823ca Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:15:38 +0000 Subject: [PATCH 185/293] update ModelRunnerOutput --- vllm/v1/outputs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 3f7c555b04841..a359f4cbb54a2 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,5 +1,7 @@ from dataclasses import dataclass -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple + +import torch @dataclass @@ -26,9 +28,9 @@ class ModelRunnerOutput: # [num_reqs] sampled_token_ids: List[int] - # [num_reqs, max_num_logprobs] - logprob_token_ids: List[List[int]] - logprobs: List[List[float]] + # [num_reqs, max_num_logprobs + 1] + logprob_token_ids_cpu: Optional[torch.Tensor] + logprobs_cpu: Optional[torch.Tensor] # req_id -> (prompt_logprobs_token_ids, prompt_logprobs) # [num_reqs, max_num_prompt_logprobs] From c32b6ebb8391bdfbd2bb95b460b66dd76df7a900 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:18:04 +0000 Subject: [PATCH 186/293] updated --- vllm/v1/engine/processor.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6257611531da1..8c9d7d1e523ff 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -54,20 +54,10 @@ def _validate_logprobs( self, params: Union[SamplingParams, PoolingParams], ) -> None: - """Validate requested number of sample logprobs & prompt logprobs - - Fails with ValueError if to many logprobs are requested. - - Args: - params: Sampling parameters - max_logprobs: max number of logprobs or prompt logprobs - """ - if not isinstance(params, SamplingParams): return max_logprobs = self.model_config.max_logprobs - # Validate sample logprobs. 
if (params.logprobs and params.logprobs > max_logprobs): raise ValueError( @@ -79,6 +69,11 @@ def _validate_logprobs( raise ValueError( f"Requested prompt logprobs of {params.prompt_logprobs}, " f"which is greated than max allowed: {max_logprobs}") + + def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the @@ -98,12 +93,10 @@ def process_inputs( # TODO(woosuk): Support pooling models. # TODO(woosuk): Support encoder-decoder models. - # TODO(rob): Add more param validation here. + # TODO(rob): Validate all SamplingParams. self._validate_logprobs(params) + self._validate_lora(lora_request) - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") if arrival_time is None: arrival_time = time.time() assert priority == 0, "vLLM V1 does not support priority at the moment." From 09d7592b96d38656cface80e4ed76e146ccc69b5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:24:38 +0000 Subject: [PATCH 187/293] updated --- vllm/v1/outputs.py | 4 ++-- vllm/v1/sample/sampler.py | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index a359f4cbb54a2..0fa3631650ad1 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -11,8 +11,8 @@ class SamplerOutput: sampled_token_ids: List[int] # [num_reqs, max_num_logprobs] - logprob_token_ids: List[List[int]] - logprobs: List[List[int]] + logprob_token_ids: Optional[torch.Tensor] + logprobs: Optional[torch.Tensor] # ModelRunnerOutput is serialized and sent to the scheduler process. diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 51eb80167173d..af1abcb3ca7ab 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Dict, List, Tuple +from typing import Dict, Optional, Tuple import torch import torch.nn as nn @@ -26,7 +26,7 @@ def forward( sampled = sampled.to(torch.int32) # Compute the logprobs if requested. - # NOTE: logprob CPU-GPU synchronization happens here. + # NOTE: CPU-GPU synchronization happens here. logprob_token_ids, logprobs = self.compute_logprobs( logits, sampling_metadata.max_num_logprobs) @@ -39,8 +39,10 @@ def forward( return sampler_output def compute_logprobs( - self, logits: torch.Tensor, - max_num_logprobs: int) -> Tuple[List[int], List[float]]: + self, + logits: torch.Tensor, + max_num_logprobs: int + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: if max_num_logprobs > 0: logprobs = self.get_logprobs(logits) # FIXME: Mask the sampled token_id, get topk logprobs, @@ -51,10 +53,9 @@ def compute_logprobs( # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) - # NOTE: CPU<>GPU synchronization happens here. 
- return topk_indices.tolist(), topk_logprobs.tolist() + return topk_indices.cpu(), topk_logprobs.cpu() else: - return [], [] + return None, None def process_logits( self, From f092bef50f7ed4585a1a4ddd523e4d6596ff5853 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:31:54 +0000 Subject: [PATCH 188/293] cleanup --- vllm/v1/worker/gpu_model_runner.py | 34 ++++++++++++------------------ 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 507caffeec71f..efd76d8a1d862 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -374,13 +374,13 @@ def _prepare_inputs( slot_mapping=slot_mapping, ) - # Make Sampling Metadata - sampling_metadata = self._prepare_sampling( + # Make Sampling and Prompt Logprobs Metadata. + sampling_metadata, prompt_logprobs_metadata = self._prepare_sampling( scheduler_output=scheduler_output, - sample_indices=query_start_loc[1:] - 1 + sample_indices=query_start_loc[1:] - 1, + num_scheduled_tokens=num_scheduled_tokens, + req_indices=req_indices, ) - prompt_logprobs_metadata = self._prepare_prompt_logprobs( - num_scheduled_tokens, req_indices) return attn_metadata, sampling_metadata, prompt_logprobs_metadata @@ -388,7 +388,9 @@ def _prepare_sampling( self, scheduler_output: "SchedulerOutput", sample_indices: torch.Tensor, - ) -> SamplingMetadata: + num_scheduled_tokens: np.array, + req_indices: np.ndarray, + ) -> Tuple[SamplingMetadata, Optional[PromptLogprobsMetadata]]: skip_copy = True if (scheduler_output.finished_req_ids or scheduler_output.preempted_req_ids): @@ -399,21 +401,13 @@ def _prepare_sampling( # Create the sampling metadata. sampling_metadata = self.input_batch.make_sampling_metadata( skip_copy, sample_indices) - return sampling_metadata - - def _prepare_prompt_logprobs( - self, - num_scheduled_tokens: np.array, - req_indices: np.ndarray, - ) -> Optional[PromptLogprobsMetadata]: - # NOTE(rob): Since this function uses the values of - # input_batch.temp/top_p/top_k, which are mutated in - # self._prepare_sampling, it should be called AFTER. - - # Create the prompt logprobs metadata. - return self.input_batch.make_prompt_logprobs_metadata( + + # Create the prompt logprobs metdata. + prompt_lps_metdata = self.input_batch.make_prompt_logprobs_metadata( num_scheduled_tokens, req_indices) + return sampling_metadata, prompt_lps_metdata + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: @@ -552,7 +546,7 @@ def execute_model( sample_hidden_states = hidden_states[sampling_metadata.sample_indicies] sample_logits = self.model.compute_logits(sample_hidden_states, None) - # Sample the next token. + # Sample the next token and get logprobs if needed. sampler_output = self.model.sample( logits=sample_logits, sampling_metadata=sampling_metadata, From 555861e42da0a773d73c0b679c7c6c483248f10b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:33:24 +0000 Subject: [PATCH 189/293] remove spurious change --- vllm/v1/engine/async_llm.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7dc465b50559a..41fb4b25d45bb 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -164,14 +164,8 @@ async def add_request( # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. 
detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id=request_id, - prompt=prompt, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=priority) + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) # 3) Add the request to Detokenizer (this process). self.detokenizer.add_request(detokenizer_req) From 5b7d629a52acfec1267dc5f87ffd72d6a493b18a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:34:21 +0000 Subject: [PATCH 190/293] updated --- vllm/v1/outputs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 0fa3631650ad1..698d7a8dcaae4 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -30,6 +30,7 @@ class ModelRunnerOutput: # [num_reqs, max_num_logprobs + 1] logprob_token_ids_cpu: Optional[torch.Tensor] + # [num_reqs, max_num_logprobs + 1] logprobs_cpu: Optional[torch.Tensor] # req_id -> (prompt_logprobs_token_ids, prompt_logprobs) From 2694b75a50acea53a154bffc649851f7e029a807 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:37:21 +0000 Subject: [PATCH 191/293] less spurious changes --- vllm/v1/outputs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 698d7a8dcaae4..91666479abe80 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -10,8 +10,9 @@ class SamplerOutput: # [num_reqs] sampled_token_ids: List[int] - # [num_reqs, max_num_logprobs] + # [num_reqs, max_num_logprobs + 1] logprob_token_ids: Optional[torch.Tensor] + # [num_reqs, max_num_logprobs + 1] logprobs: Optional[torch.Tensor] From 3d651fce0afdb96bdc41393558655f1bde27bd8f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:51:21 +0000 Subject: [PATCH 192/293] updated --- vllm/v1/request.py | 5 +---- vllm/v1/sample/sampler.py | 16 ++++++++++++---- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 97190ee83ed23..312015d04ed33 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -49,10 +49,7 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() - # Number of sample logprobs and prompt logprobs requested, - # respectively - self.request_sample_logprobs = sampling_params.logprobs - self.request_prompt_logprobs = sampling_params.prompt_logprobs + # If sample logprobs are enabled, the number of sample logprobs cannot # be anticipated in advance (because the LLM is partially responsible # for deciding when the completion is finished.) So, diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index af1abcb3ca7ab..9e142937be894 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -41,18 +41,26 @@ def forward( def compute_logprobs( self, logits: torch.Tensor, - max_num_logprobs: int + max_num_logprobs: int, + sampled_token_ids: Optional[torch.Tensor] = None, + sampled_logprobs: Optional[torch.Tensor] = None, ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: if max_num_logprobs > 0: logprobs = self.get_logprobs(logits) # FIXME: Mask the sampled token_id, get topk logprobs, # and concatenate the topk with the sampled token_id. 
- topk_logprobs, topk_indices = torch.topk(logprobs, - max_num_logprobs, - dim=-1) + topk_logprobs, topk_indices = torch.topk( + logprobs, max_num_logprobs, dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) + # Concatenate with the sampled token_id if provided. + if sampled_logprobs: + topk_indices = torch.cat([topk_indices, + sampled_token_ids]) + topk_logprobs = torch.cat([topk_logprobs, + sampled_logprobs]) + return topk_indices.cpu(), topk_logprobs.cpu() else: return None, None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index efd76d8a1d862..f806a786e4fb8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -613,8 +613,8 @@ def execute_model( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=sampled_token_ids, - logprob_token_ids=sampler_output.logprob_token_ids, - logprobs=sampler_output.logprobs, + logprob_token_ids_cpu=sampler_output.logprob_token_ids, + logprobs_cpu=sampler_output.logprobs, prompt_logprobs=prompt_logprobs_output, ) return model_runner_output From cbe8275b1cdf0788c590c9db6994b6784093c3aa Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 13:53:07 +0000 Subject: [PATCH 193/293] updated to include the sampled logprob --- vllm/v1/sample/sampler.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 9e142937be894..42f744de8ce27 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -43,23 +43,20 @@ def compute_logprobs( logits: torch.Tensor, max_num_logprobs: int, sampled_token_ids: Optional[torch.Tensor] = None, - sampled_logprobs: Optional[torch.Tensor] = None, ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: if max_num_logprobs > 0: logprobs = self.get_logprobs(logits) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. topk_logprobs, topk_indices = torch.topk( logprobs, max_num_logprobs, dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) # Concatenate with the sampled token_id if provided. - if sampled_logprobs: - topk_indices = torch.cat([topk_indices, - sampled_token_ids]) - topk_logprobs = torch.cat([topk_logprobs, - sampled_logprobs]) + if sampled_token_ids: + # TODO(rob): check if the concat is right. + sampled_logprobs = logprobs[sampled_token_ids] + topk_indices = torch.cat([topk_indices, sampled_token_ids]) + topk_logprobs = torch.cat([topk_logprobs, sampled_logprobs]) return topk_indices.cpu(), topk_logprobs.cpu() else: From 531eeb7e74c68577761346d21ffbb453b9b6fa87 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 14:05:22 +0000 Subject: [PATCH 194/293] fix logprobs --- vllm/outputs.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 066f566b96097..fc8a501be2c4b 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -145,18 +145,7 @@ def new( cumulative_logprob: Optional[float], finished: bool = False, ) -> "RequestOutput": - """Initialize a new RequestOutput object. 
- - Args: - request_id - prompt: optional single prompt string - prompt_token_ids: optional list of prompt tokens - text: completion text - token_ids: completion token ids - logprobs: completion sample logprobs - prompt_logprobs: prompt logprobs - finished: whether the request is finished - """ + """Initialize a new RequestOutput object.""" # TODO: Support `n` > 1. completion_output = CompletionOutput( From c4ed7baa84dde1353c4269b683ae69247612dd79 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 14:22:56 +0000 Subject: [PATCH 195/293] add utility class --- vllm/v1/engine/core.py | 5 ++-- vllm/v1/engine/core_client.py | 6 ++--- vllm/v1/serial_utils.py | 45 ++++++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e658b744caf27..442876c805ae1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -9,7 +9,6 @@ import zmq import zmq.asyncio -from msgspec import msgpack from vllm.config import CacheConfig, VllmConfig from vllm.executor.multiproc_worker_utils import get_mp_context @@ -25,7 +24,7 @@ from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import PickleEncoder, custom_enc_hook +from vllm.v1.serial_utils import PickleEncoder, MsgpackEncoder from vllm.v1.utils import make_zmq_socket from vllm.version import __version__ as VLLM_VERSION @@ -364,7 +363,7 @@ def process_output_socket(self, output_path: str): """Output socket IO thread.""" # Msgpack serialization encoding. - encoder = msgpack.Encoder(enc_hook=custom_enc_hook) + encoder = MsgpackEncoder() # Reuse send buffer. buffer = bytearray() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f5efdf7605a2d..72b798cdea3b3 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,7 +2,6 @@ import weakref from typing import List, Optional -import msgspec import zmq import zmq.asyncio @@ -13,7 +12,7 @@ EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import (EngineCore, EngineCoreProc, EngineCoreProcHandle) -from vllm.v1.serial_utils import PickleEncoder, custom_ext_hook +from vllm.v1.serial_utils import PickleEncoder, MsgpackDecoder logger = init_logger(__name__) @@ -134,8 +133,7 @@ def __init__( ): # Serialization setup. self.encoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs, - ext_hook=custom_ext_hook) + self.decoder = MsgpackDecoder(EngineCoreOutputs) # ZMQ setup. 
if asyncio_mode: diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 76f7076cfa9e0..9b002052aae78 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,11 +1,11 @@ +import torch import pickle -from typing import Any - import numpy as np +from typing import Any from msgspec import msgpack CUSTOM_TYPE_CODE_PICKLE = 1 -pickle_types = (np.ndarray, ) +PICKLE_TYPES = torch.Tensor class PickleEncoder: @@ -15,24 +15,47 @@ def encode(self, obj): def decode(self, data): return pickle.loads(data) + + +class MsgpackEncoder: + """Encoder with custom torch tensor serialization.""" + + def __init__(self): + self.encoder = msgpack.Encoder(enc_hook=custom_enc_hook) + + def encode(self, obj: Any) -> bytes: + return self.encoder.encode(obj) + + def encode_into(self, obj: Any, buf: bytearray) -> None: + self.encoder.encode_into(obj, buf) + + +class MsgpackDecoder: + """Decoder with custom torch tensor serialization.""" + + def __init__(self, t: Any): + self.decoder = msgpack.Decoder(t, ext_hook=custom_ext_hook) + + def decode(self, obj: Any): + return self.decoder.decode(obj) + def custom_enc_hook(obj: Any) -> Any: - if isinstance(obj, pickle_types): - # Return an `Ext` object so msgspec serializes it as an extension type. - return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj)) + if isinstance(obj, PICKLE_TYPES): + # NOTE(rob): it is fastest to use numpy + pickle + # when serializing torch tensors. + # https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, + pickle.dumps(obj.numpy())) else: - # Raise a NotImplementedError for other types raise NotImplementedError( f"Objects of type {type(obj)} are not supported") def custom_ext_hook(code: int, data: memoryview) -> Any: if code == CUSTOM_TYPE_CODE_PICKLE: - # This extension type represents a complex number, decode the data - # buffer accordingly. 
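A minimal round-trip sketch for these hooks, assuming the MsgpackEncoder/MsgpackDecoder helpers introduced in this commit and a made-up payload (illustrative only, not part of the diff):

    from typing import Any

    import torch

    from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder

    # A tensor travels as a pickled numpy array inside a msgpack Ext value
    # and is rebuilt with torch.from_numpy on the receiving side.
    payload = {"topk_logprobs": torch.arange(4, dtype=torch.float32)}
    buf = MsgpackEncoder().encode(payload)
    decoded = MsgpackDecoder(Any).decode(buf)
    assert torch.equal(decoded["topk_logprobs"], payload["topk_logprobs"])
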
- return pickle.loads(data) + return torch.from_numpy(pickle.loads(data)) else: - # Raise a NotImplementedError for other extension type codes raise NotImplementedError( f"Extension type code {code} is not supported") From a7cb6917a7e5ef7fe994d80a139cc35403f96adb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 14:23:38 +0000 Subject: [PATCH 196/293] format --- vllm/v1/serial_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 9b002052aae78..62033295ce159 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,6 +1,5 @@ import torch import pickle -import numpy as np from typing import Any from msgspec import msgpack @@ -40,7 +39,6 @@ def decode(self, obj: Any): return self.decoder.decode(obj) - def custom_enc_hook(obj: Any) -> Any: if isinstance(obj, PICKLE_TYPES): # NOTE(rob): it is fastest to use numpy + pickle From d001a05b08c0bba38d55bf41a84b298c94ee0af4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 14:26:40 +0000 Subject: [PATCH 197/293] remove cruft --- vllm/v1/engine/core.py | 3 +-- vllm/v1/serial_utils.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 442876c805ae1..a899cf936c9dd 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -16,13 +16,12 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext -from vllm.v1.core.scheduler import Scheduler, SchedulerOutput +from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor -from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder, MsgpackEncoder from vllm.v1.utils import make_zmq_socket diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 62033295ce159..ff5ead89d4b7b 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -4,8 +4,6 @@ from msgspec import msgpack CUSTOM_TYPE_CODE_PICKLE = 1 -PICKLE_TYPES = torch.Tensor - class PickleEncoder: @@ -40,7 +38,7 @@ def decode(self, obj: Any): def custom_enc_hook(obj: Any) -> Any: - if isinstance(obj, PICKLE_TYPES): + if isinstance(obj, torch.Tensor): # NOTE(rob): it is fastest to use numpy + pickle # when serializing torch tensors. 
# https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 From 3a257b83cb8029d80aec6ea2baf44bbac9c17fe6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 15:00:23 +0000 Subject: [PATCH 198/293] update comment --- vllm/v1/engine/processor.py | 2 +- vllm/v1/outputs.py | 2 +- vllm/v1/sample/sampler.py | 9 +++--- vllm/v1/serial_utils.py | 12 ++++---- vllm/v1/worker/gpu_input_batch.py | 13 ++++---- vllm/v1/worker/gpu_model_runner.py | 49 ++++++++++++------------------ 6 files changed, 39 insertions(+), 48 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8c9d7d1e523ff..c2aec2ce486b8 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -69,7 +69,7 @@ def _validate_logprobs( raise ValueError( f"Requested prompt logprobs of {params.prompt_logprobs}, " f"which is greated than max allowed: {max_logprobs}") - + def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 91666479abe80..8633c9e63d30d 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -36,4 +36,4 @@ class ModelRunnerOutput: # req_id -> (prompt_logprobs_token_ids, prompt_logprobs) # [num_reqs, max_num_prompt_logprobs] - prompt_logprobs: Dict[str, Tuple[List[List[int], List[List[float]]]]] + prompt_logprobs: Dict[str, Tuple[torch.Tensor, torch.Tensor]] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 42f744de8ce27..9935bf1a2562e 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -18,8 +18,8 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - logits = self.process_logits( - logits, sampling_metadata.logits_process_metadata) + logits = self.process_logits(logits, + sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. @@ -46,8 +46,9 @@ def compute_logprobs( ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: if max_num_logprobs > 0: logprobs = self.get_logprobs(logits) - topk_logprobs, topk_indices = torch.topk( - logprobs, max_num_logprobs, dim=-1) + topk_logprobs, topk_indices = torch.topk(logprobs, + max_num_logprobs, + dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index ff5ead89d4b7b..22ac90b0cae45 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -5,6 +5,7 @@ CUSTOM_TYPE_CODE_PICKLE = 1 + class PickleEncoder: def encode(self, obj): @@ -12,17 +13,17 @@ def encode(self, obj): def decode(self, data): return pickle.loads(data) - + class MsgpackEncoder: """Encoder with custom torch tensor serialization.""" def __init__(self): self.encoder = msgpack.Encoder(enc_hook=custom_enc_hook) - + def encode(self, obj: Any) -> bytes: return self.encoder.encode(obj) - + def encode_into(self, obj: Any, buf: bytearray) -> None: self.encoder.encode_into(obj, buf) @@ -32,7 +33,7 @@ class MsgpackDecoder: def __init__(self, t: Any): self.decoder = msgpack.Decoder(t, ext_hook=custom_ext_hook) - + def decode(self, obj: Any): return self.decoder.decode(obj) @@ -42,8 +43,7 @@ def custom_enc_hook(obj: Any) -> Any: # NOTE(rob): it is fastest to use numpy + pickle # when serializing torch tensors. 
# https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 - return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, - pickle.dumps(obj.numpy())) + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) else: raise NotImplementedError( f"Objects of type {type(obj)} are not supported") diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e5c0caad546d6..7f9678052e257 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -116,10 +116,10 @@ def __init__( # generator should not be included in the dictionary. self.generators: Dict[int, torch.Generator] = {} - # Logprobs-related. - # NOTE(rob): The prompt logprobs trackers only include reqs that - # are actively generating logprobs (i.e. they in prefill phase). + self.num_logprobs: Dict[str, int] = {} + # NOTE(rob): num_prompt_logprobs ONLY includes reqs + # that are currently in the prefill phase. self.num_prompt_logprobs: Dict[str, int] = {} def add_request( @@ -274,7 +274,8 @@ def make_sampling_metadata( def make_prompt_logprobs_metadata( self, - num_scheduled_tokens: np.ndarray, + partial_req_ids: List[int], + num_scheduled_tokens: Dict[str, int], req_indices: np.ndarray, ) -> Optional[PromptLogprobsMetadata]: @@ -286,8 +287,8 @@ def make_prompt_logprobs_metadata( metas = {} for req_id in self.num_prompt_logprobs: req_index = self.req_id_to_index[req_id] - num_scheduled_tok = num_scheduled_tokens[req_index] - top_p = self.top_p + req_num_scheduled_tokens = num_scheduled_tokens[req_index] + top_p = self.top_p[req_id] logits_masks = { req_id: (req_indices == self.req_id_to_index[req_id]) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f806a786e4fb8..f30ee6718df2f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -401,7 +401,7 @@ def _prepare_sampling( # Create the sampling metadata. sampling_metadata = self.input_batch.make_sampling_metadata( skip_copy, sample_indices) - + # Create the prompt logprobs metdata. prompt_lps_metdata = self.input_batch.make_prompt_logprobs_metadata( num_scheduled_tokens, req_indices) @@ -553,31 +553,20 @@ def execute_model( ) # Compute prompt logprobs if requested. - prompt_logprobs_output = {} - if prompt_logprobs_metadata: - # NOTE(rob): the implementation computes logits that are not - # needed and uses a small loop to keep code simple for a low - # importance feature (primarily used for lm-eval evaluations). - - # First, compute the logits for all elements of the batch - logits = self.model.sampler.compute_logits(hidden_states, None) - logits = self.model.sampler.process_logits( - logits, prompt_logprobs_metadata.logits_process_metadata) - - # Second, compute the logprobs for requests needing prompt lps. - # NOTE(rob): We should avoid looping over all reqs, this loop - # this only loops over active prefills which need prompt lps. - prompt_logprobs = {} - for req_id, logits_mask in logits_mask.items(): - req_logits = logits[logits_mask] - lp_token_ids, lps = self.model.sampler.compute_logprobs( - req_logits, self.input_batch.num_prompt_logprobs[req_id]) - - # TODO: remove the sample logprob by checking if this is a - # partial request or not. - prompt_logprobs[req_id] = (lp_token_ids, lps) - - # Second, split the prompt logprobs + # NOTE(rob): this implementation computes the prompt logprobs for + # each active prompt separately, which is suboptimal. 
However, + # there are typically < 5 active prefills in a batch and prompt + # logprobs are a rare feature (used by lm-eval-harness), so + # prioritize simplicity over performance. + prompt_logprobs_output: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} + for (req_id, mask, num_logprobs, + logits_process_metadata) in prompt_logprobs_metadata.zipped(): + # Compute logits. + logits = self.model.sampler.compute_logits( + hidden_states[mask], None) + # Compute logprobs. + prompt_logprobs_output[req_id] = self.model.sampler.get_prompt_logprobs( + logits, logits_process_metadata, num_logprobs) # Update Request State. sampled_token_ids = sampler_output.sampled_token_ids @@ -603,10 +592,10 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) - # num_reqs entries should be non-None - assert all( - req_id is not None for req_id in - self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) model_runner_output = ModelRunnerOutput( From bd38a24b6238cbb32d3382ca14d2558423dd7a70 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 15:00:31 +0000 Subject: [PATCH 199/293] nit --- vllm/v1/worker/gpu_input_batch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 7f9678052e257..8c5d86994a34f 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -116,7 +116,6 @@ def __init__( # generator should not be included in the dictionary. self.generators: Dict[int, torch.Generator] = {} - self.num_logprobs: Dict[str, int] = {} # NOTE(rob): num_prompt_logprobs ONLY includes reqs # that are currently in the prefill phase. From 531c007e7c7ac62ab4d515f62ec673bd24bd45af Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 15:32:48 +0000 Subject: [PATCH 200/293] stash --- vllm/v1/sample/metadata.py | 11 +++-- vllm/v1/worker/gpu_input_batch.py | 74 +++++++++++++++++-------------- 2 files changed, 46 insertions(+), 39 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 238d471b30291..02c2adf5c6c81 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -28,9 +28,8 @@ class SamplingMetadata: @dataclass class PromptLogprobsMetadata: - - # req_id -> mask of indices each prompt logprob - logits_masks: Dict[str, numpy.ndarray[bool]] - - # Logits process metadata for all elts of the batch - logits_process_metadata: LogitsProcessMetadata + + req_ids: List[str] + masks: List[int] + logits_process_metadatas: List[LogitsProcessMetadata] + num_prompt_logprobs: List[int] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 8c5d86994a34f..d3497908cdb9e 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -274,46 +274,54 @@ def make_sampling_metadata( def make_prompt_logprobs_metadata( self, partial_req_ids: List[int], - num_scheduled_tokens: Dict[str, int], req_indices: np.ndarray, ) -> Optional[PromptLogprobsMetadata]: if not self.max_num_prompt_logprobs: return None - # Create masks for each request needing prompt logprobs. - # TODO(rob): wrap this in torch tensor + move to GPU? 
- metas = {} - for req_id in self.num_prompt_logprobs: - req_index = self.req_id_to_index[req_id] - req_num_scheduled_tokens = num_scheduled_tokens[req_index] - top_p = self.top_p[req_id] - - logits_masks = { - req_id: (req_indices == self.req_id_to_index[req_id]) - for req_id in self.num_prompt_logprobs - } - - # Expand temp, top_p, and top_k for the whole batch - num_scheduled_tokens_torch = torch.from_numpy(num_scheduled_tokens).to( - self.temperature.device) - temperature = torch.repeat_interleave(self.temperature, - num_scheduled_tokens_torch) - # Skip expansion if we are going to skip top_p/k anyways. - top_p = (self.top_p if self.no_top_p else torch.repeat_interleave( - self.top_p, num_scheduled_tokens_torch)) - top_k = (self.top_k if self.no_top_k else torch.repeat_interleave( - self.top_k, num_scheduled_tokens_torch)) - + # NOTE(rob): we should avoid loops like this in model runner, + # but this ONLY loops over requests that are currently in + # prefill phase AND need prompt lps. + req_ids = [] + masks = [] + logits_process_metadatas = [] + num_prompt_logprobs = [] + + # TODO(rob): should we move this to _update_states? + for req_id, req_num_prompt_logprobs in self.num_prompt_logprobs.items(): + req_idx = self.req_id_to_index[req_id] + + # Make the logits mask for the request prefills. + mask = req_indices[req_indices == req_idx].tolist() + if req_id not in partial_req_ids: + # Remove the sample token if there is one. + mask = mask[:-1] + + # NOTE(rob): the tensors are shape 1, so we can use them in + # process_logits since they will be broadcasted to shape N. + temperature = self.temperature[req_idx] + top_p = self.top_p[req_idx] + top_k = self.top_k[req_idx] + no_top_p = req_id not in self.top_p_reqs + no_top_k = req_id not in self.top_k_reqs + + req_ids.append(req_id) + masks.append(mask) + num_prompt_logprobs.append(req_num_prompt_logprobs) + logits_process_metadatas.append( + LogitsProcessMetadata( + temperature=temperature, + top_p=top_p, + top_k=top_k, + no_top_p=no_top_p, + no_top_k=no_top_k)) + return PromptLogprobsMetadata( - logits_masks=logits_masks, - logits_process_metadata=LogitsProcessMetadata( - temperature=temperature, - top_p=top_p, - top_k=top_k, - no_top_p=self.no_top_p, - no_top_k=self.no_top_k, - )) + req_ids=req_ids, + logits_process_metadatas=logits_process_metadatas, + masks=masks, + num_prompt_logprobs=num_prompt_logprobs) @property def num_reqs(self) -> int: From 0497bf9826f8ba1df5ffa0d8e7f937d9f1623164 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 15:42:46 +0000 Subject: [PATCH 201/293] update --- vllm/v1/sample/metadata.py | 6 ++++++ vllm/v1/sample/sampler.py | 27 ++++++++++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 7 +++---- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 02c2adf5c6c81..106044c14a9dc 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -33,3 +33,9 @@ class PromptLogprobsMetadata: masks: List[int] logits_process_metadatas: List[LogitsProcessMetadata] num_prompt_logprobs: List[int] + + def zipped(self): + return zip(self.req_ids, + self.masks, + self.logits_process_metadatas, + self.num_prompt_logprobs) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 9935bf1a2562e..46104a26be76c 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -18,7 +18,7 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - logits = 
self.process_logits(logits, + logits = self._process_logits(logits, sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) @@ -27,8 +27,10 @@ def forward( # Compute the logprobs if requested. # NOTE: CPU-GPU synchronization happens here. - logprob_token_ids, logprobs = self.compute_logprobs( - logits, sampling_metadata.max_num_logprobs) + logprob_token_ids, logprobs = self._compute_logprobs( + logit=logits, + max_num_logprobs=sampling_metadata.max_num_logprobs, + sampled_token_ids=sampled) # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( @@ -37,8 +39,23 @@ def forward( logprobs=logprobs, ) return sampler_output + + def compute_prompt_logprobs( + self, + logits: torch.Tensor, + logits_process_metadata: LogitsProcessMetadata, + num_logprobs: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + logits = self._process_logits(logits, logits_process_metadata) + + # NOTE: CPU-GPU synchronization happens here. + logprob_token_ids, logprobs = self._compute_logprobs( + logits=logits, + max_num_logprobs=num_logprobs) - def compute_logprobs( + return logprob_token_ids, logprobs + + def _compute_logprobs( self, logits: torch.Tensor, max_num_logprobs: int, @@ -63,7 +80,7 @@ def compute_logprobs( else: return None, None - def process_logits( + def _process_logits( self, logits: torch.Tensor, logits_process_metadata: LogitsProcessMetadata, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f30ee6718df2f..80cea6ba4bcb5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -403,10 +403,10 @@ def _prepare_sampling( skip_copy, sample_indices) # Create the prompt logprobs metdata. - prompt_lps_metdata = self.input_batch.make_prompt_logprobs_metadata( + prompt_lps_metadata = self.input_batch.make_prompt_logprobs_metadata( num_scheduled_tokens, req_indices) - return sampling_metadata, prompt_lps_metdata + return sampling_metadata, prompt_lps_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs @@ -559,8 +559,7 @@ def execute_model( # logprobs are a rare feature (used by lm-eval-harness), so # prioritize simplicity over performance. prompt_logprobs_output: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} - for (req_id, mask, num_logprobs, - logits_process_metadata) in prompt_logprobs_metadata.zipped(): + for (req_id, mask, logits_process_metadata, num_logprobs) in prompt_logprobs_metadata.zipped(): # Compute logits. logits = self.model.sampler.compute_logits( hidden_states[mask], None) From 25041f66568eb04b07f96aecd82faacbd95e484e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 16:48:38 +0000 Subject: [PATCH 202/293] stash --- vllm/v1/core/scheduler.py | 23 +++++++++++++++++++- vllm/v1/engine/__init__.py | 11 +++++----- vllm/v1/outputs.py | 2 +- vllm/v1/sample/metadata.py | 6 ++---- vllm/v1/sample/sampler.py | 11 +++++----- vllm/v1/worker/gpu_input_batch.py | 16 +++++++------- vllm/v1/worker/gpu_model_runner.py | 34 +++++++++++++++++------------- 7 files changed, 63 insertions(+), 40 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 80b53c3025b39..3c1e383363151 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -394,6 +394,9 @@ def update_from_output( ) -> List[EngineCoreOutput]: # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids + logprobs_token_ids_cpu = model_runner_output.logprob_token_ids_cpu + logprobs_cpu = model_runner_output.logprobs_cpu + prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] engine_core_outputs: List[EngineCoreOutput] = [] @@ -428,13 +431,31 @@ def update_from_output( # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) + # Extract sample logprobs if needed. + # TODO(rob): does it make sense to pythonize here? + do_lps = logprobs_cpu is not None + logprobs_token_ids = (logprobs_token_ids_cpu[req_index] + if do_lps else None) + logprobs = logprobs_cpu[req_index] if do_lps else None + + # Extract prompt logprobs for this req if needed. + # TODO(rob): does it make sense to pythonize here? + # FIXME(rob): handle partial request. Currently we throw away + # the prompt logprobs for the partial request. + prompt_logprobs_token_ids, prompt_logprobs = ( + prompt_logprobs_dict.get(req_id, default=(None,None))) + # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason) + stop_reason=request.stop_reason, + logprobs_token_ids=logprobs_token_ids, + logprobs=logprobs, + prompt_logprobs_token_ids=prompt_logprobs_token_ids, + prompt_logprobs=prompt_logprobs) engine_core_outputs.append(output) # Breakout of the loop. diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 6f05bb2634408..1e786d21b5e3a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,9 +1,9 @@ import enum from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import msgspec -import numpy.typing as npt +import torch from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict @@ -56,9 +56,10 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]] - prompt_logprobs: Optional[npt.NDArray] - prompt_logprobs_token_ids: Optional[npt.NDArray] + logprobs: Optional[torch.Tensor] + logprobs_token_ids: Optional[torch.Tensor] + prompt_logprobs: Optional[torch.Tensor] + prompt_logprobs_token_ids: Optional[torch.Tensor] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 8633c9e63d30d..5460afaa1bdf3 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -36,4 +36,4 @@ class ModelRunnerOutput: # req_id -> (prompt_logprobs_token_ids, prompt_logprobs) # [num_reqs, max_num_prompt_logprobs] - prompt_logprobs: Dict[str, Tuple[torch.Tensor, torch.Tensor]] + prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 106044c14a9dc..1ec76b2d6bad9 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -28,14 +28,12 @@ class SamplingMetadata: @dataclass class PromptLogprobsMetadata: - + req_ids: List[str] masks: List[int] logits_process_metadatas: List[LogitsProcessMetadata] num_prompt_logprobs: List[int] def zipped(self): - return zip(self.req_ids, - self.masks, - self.logits_process_metadatas, + return 
zip(self.req_ids, self.masks, self.logits_process_metadatas, self.num_prompt_logprobs) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 46104a26be76c..04ea53b76385e 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -18,8 +18,8 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - logits = self._process_logits(logits, - sampling_metadata.logits_process_metadata) + logits = self._process_logits( + logits, sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. @@ -39,7 +39,7 @@ def forward( logprobs=logprobs, ) return sampler_output - + def compute_prompt_logprobs( self, logits: torch.Tensor, @@ -50,11 +50,10 @@ def compute_prompt_logprobs( # NOTE: CPU-GPU synchronization happens here. logprob_token_ids, logprobs = self._compute_logprobs( - logits=logits, - max_num_logprobs=num_logprobs) + logits=logits, max_num_logprobs=num_logprobs) return logprob_token_ids, logprobs - + def _compute_logprobs( self, logits: torch.Tensor, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index d3497908cdb9e..bc8c9474737b4 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -289,7 +289,8 @@ def make_prompt_logprobs_metadata( num_prompt_logprobs = [] # TODO(rob): should we move this to _update_states? - for req_id, req_num_prompt_logprobs in self.num_prompt_logprobs.items(): + for req_id, req_num_prompt_logprobs in self.num_prompt_logprobs.items( + ): req_idx = self.req_id_to_index[req_id] # Make the logits mask for the request prefills. @@ -310,13 +311,12 @@ def make_prompt_logprobs_metadata( masks.append(mask) num_prompt_logprobs.append(req_num_prompt_logprobs) logits_process_metadatas.append( - LogitsProcessMetadata( - temperature=temperature, - top_p=top_p, - top_k=top_k, - no_top_p=no_top_p, - no_top_k=no_top_k)) - + LogitsProcessMetadata(temperature=temperature, + top_p=top_p, + top_k=top_k, + no_top_p=no_top_p, + no_top_k=no_top_k)) + return PromptLogprobsMetadata( req_ids=req_ids, logits_process_metadatas=logits_process_metadatas, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 80cea6ba4bcb5..0fd75c58c0bd3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -552,20 +552,24 @@ def execute_model( sampling_metadata=sampling_metadata, ) - # Compute prompt logprobs if requested. - # NOTE(rob): this implementation computes the prompt logprobs for - # each active prompt separately, which is suboptimal. However, - # there are typically < 5 active prefills in a batch and prompt - # logprobs are a rare feature (used by lm-eval-harness), so - # prioritize simplicity over performance. - prompt_logprobs_output: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} - for (req_id, mask, logits_process_metadata, num_logprobs) in prompt_logprobs_metadata.zipped(): - # Compute logits. - logits = self.model.sampler.compute_logits( - hidden_states[mask], None) - # Compute logprobs. - prompt_logprobs_output[req_id] = self.model.sampler.get_prompt_logprobs( - logits, logits_process_metadata, num_logprobs) + # Compute prompt logprobs if needed. + # NOTE(rob): compute prompt logprobs for each req separately, + # which is suboptimal. However, prompt logprobs are rare (used + # by lm-eval-harness) and we have few prefill per batch, + # so prioritize simplicity. 
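As a toy illustration of how the per-request masks built in `make_prompt_logprobs_metadata` select one request's prefill positions out of the flattened batch (names and sizes below are made up, not part of the diff):

    import torch

    # Each flattened token position is tagged with the index of its request:
    # here, 3 scheduled tokens for request 0 followed by 2 for request 1.
    req_indices = torch.tensor([0, 0, 0, 1, 1])
    hidden_states = torch.randn(5, 16)        # [num_batched_tokens, hidden_size]

    # Selecting request 1's positions gives the slice whose logits feed its
    # prompt-logprob computation, as in hidden_states[mask] below.
    mask = req_indices == 1
    per_request_hidden = hidden_states[mask]  # [2, 16]
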
+ prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} + if prompt_logprobs_metadata: + for req_id, mask, metadata, num_logprobs in prompt_logprobs_metadata.zipped( + ): + # TODO: make prompt lp metadata here? + + # Compute logits. + logits = self.model.sampler.compute_logits( + hidden_states[mask], None) + # Compute prompt logprobs. + prompt_lps_dict[ + req_id] = self.model.sampler.get_prompt_logprobs( + logits, metadata, num_logprobs) # Update Request State. sampled_token_ids = sampler_output.sampled_token_ids @@ -603,7 +607,7 @@ def execute_model( sampled_token_ids=sampled_token_ids, logprob_token_ids_cpu=sampler_output.logprob_token_ids, logprobs_cpu=sampler_output.logprobs, - prompt_logprobs=prompt_logprobs_output, + prompt_logprobs_dict=prompt_lps_dict, ) return model_runner_output From 062d0a76cc85c1aa0c5b8c60dcff7fd7442dae24 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 17:11:24 +0000 Subject: [PATCH 203/293] stash --- vllm/v1/core/scheduler.py | 2 +- vllm/v1/engine/detokenizer.py | 33 ++++++++++++--------------------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 3c1e383363151..e1d15d10b89da 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -443,7 +443,7 @@ def update_from_output( # FIXME(rob): handle partial request. Currently we throw away # the prompt logprobs for the partial request. prompt_logprobs_token_ids, prompt_logprobs = ( - prompt_logprobs_dict.get(req_id, default=(None,None))) + prompt_logprobs_dict.get(req_id, default=(None, None))) # Add EngineCoreOutput for this Request. output = EngineCoreOutput( diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a539f21d889f5..73257c8fc7e22 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -24,9 +24,6 @@ class IncrementalDetokenizer: output_text: str tokens: List[str] token_ids: List[int] - logprobs: Optional[SampleLogprobs] - prompt_logprobs: Optional[PromptLogprobs] - cumulative_logprob: Optional[float] # Stop strings stop: List[str] @@ -49,11 +46,12 @@ class IncrementalDetokenizer: # Tokenizer for this request tokenizer: AnyTokenizer - # Maximum number of sample logprobs for this request - max_request_sample_logprobs: Optional[int] - - # Maximum number of prompt logprobs for this request - max_request_prompt_logprobs: Optional[int] + # Logprobs for this request + logprobs: List[SampleLogprobs] + prompt_logprobs: List[PromptLogprobs] + cumulative_logprob: float + num_logprobs: int + num_prompt_logprobs: int # Accounting for stop string buffering stop_buffer_length: int @@ -61,7 +59,6 @@ class IncrementalDetokenizer: @property def output_token_ids(self) -> List[int]: - """Return generated tokens""" assert len(self.token_ids) >= len(self.prompt_token_ids) return self.token_ids[len(self.prompt_token_ids):] @@ -86,13 +83,6 @@ def from_new_request( else: stop_buffer_length = 0 - # Flags for whether to detokenize sample logprobs and prompt logprobs, - # respectively. 
- do_request_logprobs = (request.logprobs is not None - and request.logprobs > 0) - do_request_prompt_logprobs = (request.prompt_logprobs is not None - and request.prompt_logprobs > 0) - return cls( output_text="", tokens=tokens, @@ -112,11 +102,12 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - max_request_sample_logprobs=request.logprobs, - max_request_prompt_logprobs=request.prompt_logprobs, - request_logprobs=[] if do_request_logprobs else None, - request_prompt_logprobs=[] if do_request_prompt_logprobs else None, - request_cumulative_logprob=0 if do_request_logprobs else None) + logprobs=[], + prompt_logprobs=[], + cumulative_logprob=0., + num_logprobs=request.logprobs, + num_prompt_logprobs=request.prompt_logprobs, + ) def _detokenize_ids( self, From 94d9b382fd2e7f06b923336a1680199a57279018 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 17:42:42 +0000 Subject: [PATCH 204/293] updated --- vllm/v1/engine/detokenizer.py | 84 +++++++++++++++-------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 73257c8fc7e22..275662a09f953 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,8 +1,7 @@ from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Tuple, Union, cast -import numpy as np -import numpy.typing as npt +import torch from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -276,32 +275,34 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( def add_tokens( self, - new_sampled_token_ids: List[int], - new_sample_logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], - new_prompt_logprobs: Optional[npt.NDArray], - new_prompt_logprob_token_ids: Optional[npt.NDArray], + new_token_ids: List[int], + new_logprobs_token_ids: Optional[List[torch.Tensor]], + new_logprobs: Optional[List[torch.Tensor]], + prompt_logprobs_token_ids: Optional[torch.Tensor], + prompt_logprobss: Optional[torch.Tensor], finish_reason: Optional[str], stop_reason: Optional[Union[int, str, None]], ) -> Optional[RequestOutput]: """Update RequestState for the request_id. - 1) If necessary, detokenize sample logprobs *non*-incrementally - 2) If necessary, detokenize prompt logprobs *non*-incrementally - 3) Detokenize the new token ids incrementally. + 1) Detokenize sample logprobs non-incrementally if needed + 2) Detokenize prompt logprobs non-incrementally if needed + 3) Detokenize the new token ids incrementally 4) Evaluate stop criteria 5) Update the `RequestOutput` object with new text + NOTE(rob): in the current implementation of EngineCore, + the lists above will all be of length 1 since we can only + generate one token at a time. 
+ Args: new_token_ids: list of newly-sampled token ids - new_logprobs: list of (logprobs,token ids) top logprobs - tuples for sampled tokens - new_prompt_logprobs: num_chunk_tokens x num_prompt_logprobs np array - of prompt logprobs values - new_prompt_logprob_token_ids: num_chunk_tokens x num_prompt_logprobs - np array of top token ids - finish_reason: string representation of the reason request - detokenization completed - stop_reason: reason that detokenization stopped + new_logprobs_token_ids: list of topk ids (1 for each new token) + new_logprobs: list of topk logprobs (1 for each new token) + prompt_logprob_token_ids: topk ids for each prompt token + prompt_logprobs: topk logprobs for each prompt token + finish_reason: reason request finished in engine + stop_reason: reason request stopped in the stopped Returns: Returns request output instance, except i.e. when the request @@ -309,28 +310,20 @@ def add_tokens( which has not occurred yet. """ - # Only try to Pythonize sample logprobs if any were provided - do_request_sample_logprobs = new_sample_logprobs is not None and len( - new_sample_logprobs) > 0 - if do_request_sample_logprobs: - assert new_sample_logprobs is not None - assert len(new_sample_logprobs) == len(new_sampled_token_ids) - # Only try to Pythonize prompt logprobs if any were provided - do_request_prompt_logprobs = new_prompt_logprobs is not None and len( - new_prompt_logprobs) > 0 - if do_request_prompt_logprobs: - assert new_prompt_logprob_token_ids is not None + # 1) Pythonize & detokenize sample logprobs. + if new_logprobs: + assert new_logprobs_token_ids is not None + assert len(new_logprobs) == len(new_token_id) - if do_request_sample_logprobs: - # 1) Pythonize & detokenize sample logprobs - assert new_sample_logprobs is not None new_sample_logprobs = ( self._pythonize_maybe_detokenize_sample_logprobs_for_request( new_sample_logprobs, new_sampled_token_ids, detokenize=True)) - if do_request_prompt_logprobs: + if prompt_logprobs: + assert prompt_logprobs_token_ids is not None + # 2) If necessary, detokenize prompt logprobs incrementally assert new_prompt_logprobs is not None assert new_prompt_logprob_token_ids is not None @@ -345,7 +338,7 @@ def add_tokens( # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. decoded_text = "" - for new_token_id in new_sampled_token_ids: + for new_token_id in new_token_ids: self.token_ids.append(new_token_id) (new_tokens, new_decoded_token_text, prefix_offset, read_offset) = detokenize_incrementally( @@ -366,8 +359,8 @@ def add_tokens( decoded_text += new_decoded_token_text + # 4) Evaluate stop criteria. if self.stop: - # 4) Evaluate stop criteria. stop = StopChecker.check_stop_strings( output_text=self.output_text, new_char_count=len(decoded_text), @@ -375,12 +368,11 @@ def add_tokens( include_in_output=self.include_stop_str_in_output, ) if stop is not None: - _, truncate_to = stop + stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] finish_reason = "stop" # TODO: use constant - - # TODO: handle stop_token_ids here too? + stop_reason = stop_str # 5) Update the RequestOutput object with the new text. 
finished = bool(finish_reason) @@ -390,15 +382,13 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) - # DELTA -> new sampled tokens and logprobs + current cumulative prompt - # logprob - # FINAL -> all sampled tokens and logprobs + current cumulative prompt - # logprob - token_ids = new_sampled_token_ids if delta else self.output_token_ids - logprobs = new_sample_logprobs if delta else self.request_logprobs - prompt_logprobs = (new_prompt_logprobs - if delta else self.request_prompt_logprobs) - cumulative_logprob = self.request_cumulative_logprob + + # DELTA -> return just newly created items. + # FINAL -> return the whole history so far. + token_ids = new_token_ids if delta else self.output_token_ids + logprobs = new_logprobs if delta else self.logprobs + prompt_logprobs = new_logprobs if delta else self.prompt_logprobs + cumulative_logprob = self.cumulative_logprob request_output = RequestOutput.new( self.request_id, From f2cdb6148fe7d39d2064caa35546147987476c33 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 17:43:54 +0000 Subject: [PATCH 205/293] updated --- vllm/v1/engine/detokenizer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 275662a09f953..ae5a2d26c6987 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -380,11 +380,9 @@ def add_tokens( and not finished: return None + # Return just newly created items if DELTA. delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) - - # DELTA -> return just newly created items. - # FINAL -> return the whole history so far. token_ids = new_token_ids if delta else self.output_token_ids logprobs = new_logprobs if delta else self.logprobs prompt_logprobs = new_logprobs if delta else self.prompt_logprobs @@ -427,7 +425,6 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: class Detokenizer: - """Track and implement detokenization of multiple requests""" def __init__(self, tokenizer_name: str, From 3c4d9c1beff8dd87cafb81331b1c5974f98d90e0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:34:35 +0000 Subject: [PATCH 206/293] updated --- vllm/transformers_utils/detokenizer_utils.py | 47 ------ vllm/v1/engine/detokenizer.py | 152 +++++++++---------- vllm/v1/sample/sampler.py | 9 +- 3 files changed, 80 insertions(+), 128 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 885e3b9d92f88..676e2583e5d09 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -167,50 +167,3 @@ def detokenize_incrementally( new_text = new_text[len(prefix_text):] return new_tokens, new_text, read_offset, len(output_tokens) - - -def detokenize_logprob_incrementally_in_place( - tokenizer: AnyTokenizer, - logprob_dict: Dict[int, Logprob], - input_ids_prefix: List[int], - prev_tokens: Optional[List[str]], - prefix_offset: int, - read_offset: int, - skip_special_tokens: bool = False, - spaces_between_special_tokens: bool = True, -) -> None: - """Detokenizes the logprobs at a single token offset incrementally. 
- - For each top-token in `logprob_dict`, apply incremental detokenization - to the token list `input_ids_prefix + [top-token id]` - - The logprob data structure is modified in-place with the string - representation of each decoded top-token. - - Args: - tokenizer: The tokenizer to use. - logprob_dict: logprob data structure for a single token position - input_ids_prefix: The input ids *preceding* the token offset under - consideration - prev_tokens: The previous tokens. If None, this function will convert - the input ids to tokens and return the tokens and the new text. - prefix_offset: The prefix offset. - read_offset: The read offset. - skip_special_tokens: Whether to skip special tokens. - spaces_between_special_tokens: Whether to add spaces between special - tokens. - """ - - for token_id in logprob_dict: - # Detokenize logprob for a particular top - # token at a particular token offset - - logprob_dict[token_id].decoded_token = detokenize_incrementally( - tokenizer=tokenizer, - all_input_ids=input_ids_prefix + [token_id], - prev_tokens=prev_tokens, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - )[1] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index ae5a2d26c6987..833e80e175e51 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -46,8 +46,8 @@ class IncrementalDetokenizer: tokenizer: AnyTokenizer # Logprobs for this request - logprobs: List[SampleLogprobs] - prompt_logprobs: List[PromptLogprobs] + logprobs: SampleLogprobs + prompt_logprobs: PromptLogprobs cumulative_logprob: float num_logprobs: int num_prompt_logprobs: int @@ -108,21 +108,6 @@ def from_new_request( num_prompt_logprobs=request.prompt_logprobs, ) - def _detokenize_ids( - self, - token_id_list: int, - ) -> List[str]: - """Helper method to detokenize one or more token ids. - - Args: - token_id_list: list of tokens to detokenize - - Returns: - List of token string representations of tokens - """ - return self.tokenizer.convert_ids_to_tokens(token_id_list, - skip_special_tokens=False) - def _pythonize_sequence_position( self, logprob_values: npt.NDArray, @@ -156,11 +141,11 @@ def _pythonize_sequence_position( zip(logprob_values, logprob_token_ids, logprob_token_strs)) } - def _pythonize_maybe_detokenize_sample_logprobs_for_request( + def _make_sample_logprobs( self, - new_sample_logprobs: List[Tuple[npt.NDArray, npt.NDArray]], - new_sample_token_ids: List[int], - detokenize: bool, + sampled_token_ids: List[int], + logprobs_token_ids_lst: List[torch.Tensor], + logprobs_lst: List[torch.Tensor], ) -> SampleLogprobs: """Pythonize sample logprobs, maybe detokenize. 
@@ -188,40 +173,51 @@ def _pythonize_maybe_detokenize_sample_logprobs_for_request( Sample logprobs compute in this step, Pythonized and possibly detokenized """ - new_pythonized_logprobs = [] - max_logprobs = self.max_request_sample_logprobs - assert max_logprobs is not None - assert self.request_logprobs is not None - for (logprob_values, - logprob_token_ids), token_id in zip(new_sample_logprobs, - new_sample_token_ids): - # Only keep the number of logprobs specified by the request - # (plus possibly the sampled token id & its logprob) - logprob_cnt = max_logprobs - if token_id not in logprob_token_ids[0:logprob_cnt]: - # Sampled token is not in the in the top logprobs; - # inject it & re-sort, ensuring that excess logprobs - # not requested by the user have -inf probability - logprob_values[max_logprobs:-1] = float('-inf') - # Get indices that would sort logprob_values in descending order - indices = np.argsort(logprob_values)[::-1] - # Use these indices to reorder logprob_values and - # logprob_token_ids - logprob_values = logprob_values[indices] - logprob_token_ids = logprob_token_ids[indices] - # There will be one more logprob than the user requested - logprob_cnt = max_logprobs + 1 - - # Pythonize top logprobs - new_pythonized_logprobs_dict = self._pythonize_sequence_position( - logprob_values[0:logprob_cnt], - logprob_token_ids[0:logprob_cnt], detokenize) - self.request_logprobs.append(new_pythonized_logprobs_dict) - self.request_cumulative_logprob += new_pythonized_logprobs_dict[ - token_id].logprob - new_pythonized_logprobs.append(new_pythonized_logprobs_dict) - - return new_pythonized_logprobs + + # NOTE(rob): the lists are of length > 1 if a single step + # of engine core generates > 1 token (e.g. spec decoding). + assert len(sampled_token_ids) == len(logprobs_token_ids_lst) + assert len(sampled_token_ids) == len(logprobs_lst) + output_list: SampleLogprobs = [] + for sampled_token_id, logprobs, logprobs_token_ids in zip( + sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): + + # Sampler cats the lps of sampled tok before the topk lps. + assert sampled_token_id == logprobs_token_ids[0].item(), ( + "Sampler cats the sampled tokens logprobs in front of " + f"the topk logprobs, but got {sampled_token_id=} and " + f"{logprobs_token_ids[0].item()=}") + + # Pythonize the torch tensors.. + sampled_token_logprob = logprobs[0].item() + topk_token_ids = logprobs_token_ids[1:].tolist() + topk_logprobs = logprobs[1:].tolist() + + # Make the Logprob objects. + # Detokenize *non-incrementally* for simplicity. + decoded_tokens = self.tokenizer.batch_decode( + topk_token_ids.reshape(-1,1)) + # torch.topk used to select lps returns them + # in sorted order, so we can use idx for rank. + topk_logprobs_dict = { + topk_token_ids[idx]: Logprob( + logprob=topk_logprobs[idx], rank=idx, + decoded_token=decoded_tokens[idx], + ) for idx in range(self.num_logprobs) + } + + # If the sampled token was not in the topk, add it. + if sampled_token_id not in topk_logprobs_dict: + # TODO(rob): is rank for sample Logprob needed? + # it is not used in Chat Completions. 
+ token = self.tokenizer.decode(sampled_token_id) + topk_logprobs_dict[sampled_token_id] = Logprob( + logprob=sampled_token_logprob, + rank=None, decoded_token=token) + + output_list.append(topk_logprobs_dict) + + return output_list def _pythonize_maybe_detokenize_prompt_logprobs_for_request( self, @@ -279,7 +275,7 @@ def add_tokens( new_logprobs_token_ids: Optional[List[torch.Tensor]], new_logprobs: Optional[List[torch.Tensor]], prompt_logprobs_token_ids: Optional[torch.Tensor], - prompt_logprobss: Optional[torch.Tensor], + prompt_logprobs: Optional[torch.Tensor], finish_reason: Optional[str], stop_reason: Optional[Union[int, str, None]], ) -> Optional[RequestOutput]: @@ -310,31 +306,31 @@ def add_tokens( which has not occurred yet. """ - # 1) Pythonize & detokenize sample logprobs. + # 1) Make Sample Logprobs. if new_logprobs: - assert new_logprobs_token_ids is not None - assert len(new_logprobs) == len(new_token_id) - - new_sample_logprobs = ( - self._pythonize_maybe_detokenize_sample_logprobs_for_request( - new_sample_logprobs, - new_sampled_token_ids, - detokenize=True)) + sample_logprobs = self._make_sample_logprobs( + sampled_token_ids=new_token_ids, + logprobs_token_ids_lst=new_logprobs_token_ids, + logprobs=new_logprobs, + ) + self.logprobs.append(sample_logprobs) + # TODO: update cumulative logprob. + # self.cumulative_logprob += + # 2) Pythonize & detokenizer prompt logprobs. if prompt_logprobs: assert prompt_logprobs_token_ids is not None + prompt_logprobs = self._make_prompt_logprobs( + prompt_logprobs_token_ids, + prompt_logprobs, + ) + + # NOTE(rob): EngineCore does not stream out partial + # prefills, so all prompt logprobs come in one step. + assert len(self.prompt_logprobs) == 0 + self.prompt_logprobs = prompt_logprobs - # 2) If necessary, detokenize prompt logprobs incrementally - assert new_prompt_logprobs is not None - assert new_prompt_logprob_token_ids is not None - new_prompt_logprobs = ( - self._pythonize_maybe_detokenize_prompt_logprobs_for_request( - new_prompt_logprobs, - new_prompt_logprob_token_ids, - detokenize=True)) - - # 3) Detokenize the new token ids incrementally. If necessary, - # detokenize logprobs. + # 3) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. decoded_text = "" @@ -384,8 +380,8 @@ def add_tokens( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - logprobs = new_logprobs if delta else self.logprobs - prompt_logprobs = new_logprobs if delta else self.prompt_logprobs + logprobs = sample_logprobs if delta else self.logprobs + prompt_logprobs = sample_logprobs if delta else self.prompt_logprobs cumulative_logprob = self.cumulative_logprob request_output = RequestOutput.new( diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 04ea53b76385e..abb41e960a5f1 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -64,16 +64,19 @@ def _compute_logprobs( logprobs = self.get_logprobs(logits) topk_logprobs, topk_indices = torch.topk(logprobs, max_num_logprobs, - dim=-1) + dim=-1, + sorted=True) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) # Concatenate with the sampled token_id if provided. if sampled_token_ids: # TODO(rob): check if the concat is right. 
+ # TODO(rob): we need to return the rank of the sampled token + # to be compatible with the OAI spec. sampled_logprobs = logprobs[sampled_token_ids] - topk_indices = torch.cat([topk_indices, sampled_token_ids]) - topk_logprobs = torch.cat([topk_logprobs, sampled_logprobs]) + topk_indices = torch.cat([sampled_token_ids, topk_indices]) + topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) return topk_indices.cpu(), topk_logprobs.cpu() else: From 1a36c3bd71acc98eb089b984b47c287874c2c7c3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:34:59 +0000 Subject: [PATCH 207/293] updated --- vllm/transformers_utils/detokenizer_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 676e2583e5d09..37ff8a236e791 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,6 +1,4 @@ -from typing import Dict, List, Optional, Tuple - -from vllm.sequence import Logprob +from typing import List, Optional, Tuple from .tokenizer import AnyTokenizer From 9e9ec2b263a6d1ac564e3eeb28ec8a1b8fe5105f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:43:14 +0000 Subject: [PATCH 208/293] cleanup diff --- vllm/v1/engine/detokenizer.py | 70 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 833e80e175e51..1ba7d74f34780 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -272,24 +272,20 @@ def _pythonize_maybe_detokenize_prompt_logprobs_for_request( def add_tokens( self, new_token_ids: List[int], + finish_reason: Optional[str], + stop_reason: Optional[Union[int, str, None]], new_logprobs_token_ids: Optional[List[torch.Tensor]], new_logprobs: Optional[List[torch.Tensor]], prompt_logprobs_token_ids: Optional[torch.Tensor], prompt_logprobs: Optional[torch.Tensor], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], ) -> Optional[RequestOutput]: - """Update RequestState for the request_id. - - 1) Detokenize sample logprobs non-incrementally if needed - 2) Detokenize prompt logprobs non-incrementally if needed - 3) Detokenize the new token ids incrementally - 4) Evaluate stop criteria - 5) Update the `RequestOutput` object with new text - - NOTE(rob): in the current implementation of EngineCore, - the lists above will all be of length 1 since we can only - generate one token at a time. + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally + 2) Evaluate stop criteria + 3) Detokenize sample logprobs non-incrementally + 4) Detokenize prompt logprobs non-incrementally + 5) Update the `RequestOutput` object with new text Args: new_token_ids: list of newly-sampled token ids @@ -306,31 +302,7 @@ def add_tokens( which has not occurred yet. """ - # 1) Make Sample Logprobs. - if new_logprobs: - sample_logprobs = self._make_sample_logprobs( - sampled_token_ids=new_token_ids, - logprobs_token_ids_lst=new_logprobs_token_ids, - logprobs=new_logprobs, - ) - self.logprobs.append(sample_logprobs) - # TODO: update cumulative logprob. - # self.cumulative_logprob += - - # 2) Pythonize & detokenizer prompt logprobs. 
- if prompt_logprobs: - assert prompt_logprobs_token_ids is not None - prompt_logprobs = self._make_prompt_logprobs( - prompt_logprobs_token_ids, - prompt_logprobs, - ) - - # NOTE(rob): EngineCore does not stream out partial - # prefills, so all prompt logprobs come in one step. - assert len(self.prompt_logprobs) == 0 - self.prompt_logprobs = prompt_logprobs - - # 3) Detokenize the new token ids incrementally. + # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of # new_token_ids is more than 1. We need to optimize this. decoded_text = "" @@ -355,7 +327,7 @@ def add_tokens( decoded_text += new_decoded_token_text - # 4) Evaluate stop criteria. + # 2) Evaluate stop criteria. if self.stop: stop = StopChecker.check_stop_strings( output_text=self.output_text, @@ -370,6 +342,26 @@ def add_tokens( finish_reason = "stop" # TODO: use constant stop_reason = stop_str + # 3) Make Sample Logprobs. + if new_logprobs: + sample_logprobs = self._make_sample_logprobs( + sampled_token_ids=new_token_ids, + logprobs_token_ids_lst=new_logprobs_token_ids, + logprobs=new_logprobs) + self.logprobs.append(sample_logprobs) + # TODO: update cumulative logprob. + # self.cumulative_logprob + + # 4) Pythonize & detokenizer prompt logprobs. + if prompt_logprobs: + # EngineCore does not stream out partial prefill, + # so all prompt logprobs come in one step. + assert len(self.prompt_logprobs) == 0 + assert prompt_logprobs_token_ids is not None + self.prompt_logprobs = self._make_prompt_logprobs( + prompt_logprobs_token_ids, + prompt_logprobs) + # 5) Update the RequestOutput object with the new text. finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ From b99d9cdead990817dabf572edce069d51d031929 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:44:43 +0000 Subject: [PATCH 209/293] clean up diff --- vllm/v1/engine/detokenizer.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1ba7d74f34780..3520ffd6286ac 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -286,20 +286,6 @@ def add_tokens( 3) Detokenize sample logprobs non-incrementally 4) Detokenize prompt logprobs non-incrementally 5) Update the `RequestOutput` object with new text - - Args: - new_token_ids: list of newly-sampled token ids - new_logprobs_token_ids: list of topk ids (1 for each new token) - new_logprobs: list of topk logprobs (1 for each new token) - prompt_logprob_token_ids: topk ids for each prompt token - prompt_logprobs: topk logprobs for each prompt token - finish_reason: reason request finished in engine - stop_reason: reason request stopped in the stopped - - Returns: - Returns request output instance, except i.e. when the request - is configured to only return a result on the final decode step - which has not occurred yet. """ # 1) Detokenize the new token ids incrementally. From 2f8511842bfef56a9086da0ba627f62aa3c90d93 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:45:49 +0000 Subject: [PATCH 210/293] clean up diff --- vllm/v1/engine/detokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 3520ffd6286ac..34c1c8b352ab2 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -462,12 +462,12 @@ def step( # Detokenize and update state. 
request_output = detokenizer.add_tokens( new_sampled_token_ids=engine_core_output.new_token_ids, + finish_reason=engine_core_output.finish_reason, + stop_reason=engine_core_output.stop_reason, new_sample_logprobs=engine_core_output.logprobs, new_prompt_logprobs=engine_core_output.prompt_logprobs, new_prompt_logprob_token_ids=engine_core_output. prompt_logprobs_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, ) if request_output is not None: From cb8c87cf9a233d9a684d0a73dea28f5f89afd2dd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:46:23 +0000 Subject: [PATCH 211/293] more clean --- vllm/v1/engine/detokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 34c1c8b352ab2..198476b304442 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -461,7 +461,7 @@ def step( # Detokenize and update state. request_output = detokenizer.add_tokens( - new_sampled_token_ids=engine_core_output.new_token_ids, + new_token_ids=engine_core_output.new_token_ids, finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, new_sample_logprobs=engine_core_output.logprobs, From 983f2a71f7eae6f2febc966d547ef07b7107b117 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 19:53:22 +0000 Subject: [PATCH 212/293] stash --- vllm/v1/engine/detokenizer.py | 69 ++++++++++------------------------- 1 file changed, 19 insertions(+), 50 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 198476b304442..a251172f27cdf 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -108,39 +108,6 @@ def from_new_request( num_prompt_logprobs=request.prompt_logprobs, ) - def _pythonize_sequence_position( - self, - logprob_values: npt.NDArray, - logprob_token_ids: npt.NDArray, - detokenize: bool, - ) -> Dict[int, Logprob]: - """Pythonize the numpy (np) logprobs & token ids for a sequence position - - Outputs the OpenAI-API-compatible representation of the top tokens and - their logprobs at a single position in a sequence. - - Optionally detokenize (compute logprob `decoded_token`) - - Args: - logprob_values: np logprob values - logprob_token_ids: np logprob token ids - detokenize: if True, detokenize logprob top token ids - - Return: - mapping from top token id to Logprob data structure - """ - logprob_values = logprob_values.tolist() - logprob_token_ids = logprob_token_ids.tolist() - logprob_token_strs = (cast(List[Optional[str]], - self._detokenize_ids(logprob_token_ids)) if - detokenize else [None] * len(logprob_token_ids)) - - return { - lpt: Logprob(lpv, (idx + 1), lpstr) - for idx, (lpv, lpt, lpstr) in enumerate( - zip(logprob_values, logprob_token_ids, logprob_token_strs)) - } - def _make_sample_logprobs( self, sampled_token_ids: List[int], @@ -175,30 +142,30 @@ def _make_sample_logprobs( """ # NOTE(rob): the lists are of length > 1 if a single step - # of engine core generates > 1 token (e.g. spec decoding). + # of EngineCore generates > 1 token (e.g. spec decoding). assert len(sampled_token_ids) == len(logprobs_token_ids_lst) assert len(sampled_token_ids) == len(logprobs_lst) output_list: SampleLogprobs = [] for sampled_token_id, logprobs, logprobs_token_ids in zip( sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): - # Sampler cats the lps of sampled tok before the topk lps. 
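
# A minimal sketch of the non-incremental top-k detokenization this helper
# performs, using a HuggingFace tokenizer: wrapping each token id in its own
# single-element list makes batch_decode return one decoded string per id.
# The model name and token ids below are only examples.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
topk_token_ids = [100, 200, 300]  # hypothetical top-k ids for one position
decoded_tokens = tokenizer.batch_decode([[tok] for tok in topk_token_ids])
assert len(decoded_tokens) == len(topk_token_ids)
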
+ # Sampler concatenates the logprobs of the sampled token + # ahead of the topk tokens. assert sampled_token_id == logprobs_token_ids[0].item(), ( "Sampler cats the sampled tokens logprobs in front of " f"the topk logprobs, but got {sampled_token_id=} and " f"{logprobs_token_ids[0].item()=}") - # Pythonize the torch tensors.. + # Pythonize. sampled_token_logprob = logprobs[0].item() topk_token_ids = logprobs_token_ids[1:].tolist() topk_logprobs = logprobs[1:].tolist() # Make the Logprob objects. - # Detokenize *non-incrementally* for simplicity. - decoded_tokens = self.tokenizer.batch_decode( - topk_token_ids.reshape(-1,1)) - # torch.topk used to select lps returns them - # in sorted order, so we can use idx for rank. + # Detokenize *non-incrementally* + decoded_tokens = self.tokenizer.batch_decode(topk_token_ids.reshape(-1,1)) + # Sampler uses torch.topk to select the logprobs, whihch + # returns them in sorted order, so we can use idx for rank. topk_logprobs_dict = { topk_token_ids[idx]: Logprob( logprob=topk_logprobs[idx], rank=idx, @@ -206,10 +173,12 @@ def _make_sample_logprobs( ) for idx in range(self.num_logprobs) } - # If the sampled token was not in the topk, add it. + # Add a Logprob object for the sampled token if it + # is not already in the top k. if sampled_token_id not in topk_logprobs_dict: - # TODO(rob): is rank for sample Logprob needed? - # it is not used in Chat Completions. + # TODO(rob): do we need to plumb up the rank for + # the sample Logprob? It is not used in the + # Chat Completions API for instance. token = self.tokenizer.decode(sampled_token_id) topk_logprobs_dict[sampled_token_id] = Logprob( logprob=sampled_token_logprob, @@ -281,11 +250,11 @@ def add_tokens( ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: - 1) Detokenize the new token ids incrementally - 2) Evaluate stop criteria - 3) Detokenize sample logprobs non-incrementally - 4) Detokenize prompt logprobs non-incrementally - 5) Update the `RequestOutput` object with new text + 1) Detokenize the new token ids incrementally. + 2) Evaluate stop criteria. + 3) Detokenize sample logprobs non-incrementally. + 4) Detokenize prompt logprobs non-incrementally. + 5) Make the `RequestOutput` object with new text. """ # 1) Detokenize the new token ids incrementally. @@ -348,7 +317,7 @@ def add_tokens( prompt_logprobs_token_ids, prompt_logprobs) - # 5) Update the RequestOutput object with the new text. + # 5) Makes the RequestOutput object with the new text. finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: From 16a8caa28841df8f54df26f753d37710b01fb9c4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 20:52:44 +0000 Subject: [PATCH 213/293] passing mypy --- vllm/v1/core/scheduler.py | 7 +- vllm/v1/engine/__init__.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/detokenizer.py | 191 ++++++++++++++--------------- vllm/v1/engine/processor.py | 4 +- vllm/v1/request.py | 32 +---- vllm/v1/sample/metadata.py | 1 - vllm/v1/sample/sampler.py | 7 +- vllm/v1/serial_utils.py | 3 +- vllm/v1/worker/gpu_input_batch.py | 16 ++- vllm/v1/worker/gpu_model_runner.py | 17 ++- 12 files changed, 121 insertions(+), 163 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index e1d15d10b89da..a45756d310945 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -433,17 +433,16 @@ def update_from_output( # Extract sample logprobs if needed. 
# TODO(rob): does it make sense to pythonize here? - do_lps = logprobs_cpu is not None logprobs_token_ids = (logprobs_token_ids_cpu[req_index] - if do_lps else None) - logprobs = logprobs_cpu[req_index] if do_lps else None + if logprobs_token_ids_cpu else None) + logprobs = logprobs_cpu[req_index] if logprobs_cpu else None # Extract prompt logprobs for this req if needed. # TODO(rob): does it make sense to pythonize here? # FIXME(rob): handle partial request. Currently we throw away # the prompt logprobs for the partial request. prompt_logprobs_token_ids, prompt_logprobs = ( - prompt_logprobs_dict.get(req_id, default=(None, None))) + prompt_logprobs_dict.get(req_id, (None, None))) # Add EngineCoreOutput for this Request. output = EngineCoreOutput( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 1e786d21b5e3a..af3cad1965c25 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,6 +1,6 @@ import enum from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Union import msgspec import torch diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a899cf936c9dd..0004e65461dcd 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,7 +23,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import PickleEncoder, MsgpackEncoder +from vllm.v1.serial_utils import MsgpackEncoder, PickleEncoder from vllm.v1.utils import make_zmq_socket from vllm.version import __version__ as VLLM_VERSION diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 72b798cdea3b3..259943dfb194d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,7 +12,7 @@ EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import (EngineCore, EngineCoreProc, EngineCoreProcHandle) -from vllm.v1.serial_utils import PickleEncoder, MsgpackDecoder +from vllm.v1.serial_utils import MsgpackDecoder, PickleEncoder logger = init_logger(__name__) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index a251172f27cdf..aa43c6e9f2bbb 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import Dict, Iterable, List, Optional, Tuple, Union import torch @@ -46,8 +46,8 @@ class IncrementalDetokenizer: tokenizer: AnyTokenizer # Logprobs for this request - logprobs: SampleLogprobs - prompt_logprobs: PromptLogprobs + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] cumulative_logprob: float num_logprobs: int num_prompt_logprobs: int @@ -101,8 +101,8 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - logprobs=[], - prompt_logprobs=[], + logprobs=([] if request.logprobs else None), + prompt_logprobs=([] if request.prompt_logprobs else None), cumulative_logprob=0., num_logprobs=request.logprobs, num_prompt_logprobs=request.prompt_logprobs, @@ -114,31 +114,21 @@ def _make_sample_logprobs( logprobs_token_ids_lst: List[torch.Tensor], logprobs_lst: List[torch.Tensor], ) -> SampleLogprobs: - """Pythonize sample logprobs, maybe detokenize. - - Only Pythonizes sample logprobs computed in the current - step. 
Has the side effect of updating the incremental detokenizer - state by (1) appending the new sample logprobs to the list of what - was computed for previously-sampled tokens, and (2) accumulating - into the request's cumulative logprob value.ß - - Pythonization entails the conversion from a numpy (np) - values/token ids representation to the more idiomatically - Pythonic representation required by the OpenAI API, - List[Dict[int,Logprob]] - - The Logprob.decoded_token field is only computed (detokenized - from the associated top token id) if detokenize=True + """ + Create formatted SampleLogprobs objects from the raw + EngineCore outputs after pythonizing + detokenizing. + + NOTE: we detokenize the logprobs *non-incrementally* + for simplicity and performance of the implementation. Args: - new_sample_logprobs: List of (logprobs,logprob token ids) numpy array - tuples - new_sample_token_ids: List of sample token ids - detokenize: Logprob.decoded_token is computed if True, otherwise None - + sampled_token_ids: List of new sampled tokens + logprobs_token_ids_lst: List of tensors of token ids of + shape [topk+1] for the sampled + topk token ids + logprobs_lst: List of tensors of logprobs of + shape [topk+1] for to sampled + topk token ids Returns: - Sample logprobs compute in this step, Pythonized and possibly - detokenized + SampleLogprobs: List[Dict[str, Logprob]] """ # NOTE(rob): the lists are of length > 1 if a single step @@ -147,7 +137,7 @@ def _make_sample_logprobs( assert len(sampled_token_ids) == len(logprobs_lst) output_list: SampleLogprobs = [] for sampled_token_id, logprobs, logprobs_token_ids in zip( - sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): + sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): # Sampler concatenates the logprobs of the sampled token # ahead of the topk tokens. @@ -155,88 +145,81 @@ def _make_sample_logprobs( "Sampler cats the sampled tokens logprobs in front of " f"the topk logprobs, but got {sampled_token_id=} and " f"{logprobs_token_ids[0].item()=}") - - # Pythonize. sampled_token_logprob = logprobs[0].item() topk_token_ids = logprobs_token_ids[1:].tolist() topk_logprobs = logprobs[1:].tolist() # Make the Logprob objects. - # Detokenize *non-incrementally* - decoded_tokens = self.tokenizer.batch_decode(topk_token_ids.reshape(-1,1)) - # Sampler uses torch.topk to select the logprobs, whihch - # returns them in sorted order, so we can use idx for rank. + decoded_tokens = self.tokenizer.batch_decode( + topk_token_ids.reshape(-1, 1)) + # Sampler uses torch.topk() which sorts, so idx=rank. topk_logprobs_dict = { topk_token_ids[idx]: Logprob( - logprob=topk_logprobs[idx], rank=idx, + logprob=topk_logprobs[idx], + rank=idx, decoded_token=decoded_tokens[idx], - ) for idx in range(self.num_logprobs) + ) + for idx in range(self.num_logprobs) } - # Add a Logprob object for the sampled token if it - # is not already in the top k. + # Make the sampled token Logprob object if not in topk. if sampled_token_id not in topk_logprobs_dict: - # TODO(rob): do we need to plumb up the rank for - # the sample Logprob? It is not used in the + # TODO(rob): do we need to plumb up the rank for + # the sample Logprob? It is not used in the # Chat Completions API for instance. 
token = self.tokenizer.decode(sampled_token_id) topk_logprobs_dict[sampled_token_id] = Logprob( logprob=sampled_token_logprob, - rank=None, decoded_token=token) + rank=None, + decoded_token=token) output_list.append(topk_logprobs_dict) return output_list - def _pythonize_maybe_detokenize_prompt_logprobs_for_request( + def _make_prompt_logprobs( self, - prompt_logprob_values: npt.NDArray, - prompt_logprob_token_ids: npt.NDArray, - detokenize: bool, + token_ids: torch.Tensor, + logprobs: torch.Tensor, ) -> PromptLogprobs: - """Pythonize prompt logprobs, maybe detokenize. - - Only Pythonizes prompt logprobs computed in the current - step. Has the side effect of updating the incremental detokenizer - state by appending the new prompt logprobs to the list of what - was computed for previous prompt chunks. Forces the first prompt - logprob associated with the request to be `None`. - - Pythonization entails the conversion from a numpy (np) - values/token ids representation to the more idiomatically - Pythonic representation required by the OpenAI API, - List[Dict[int,Logprob]] - - The Logprob.decoded_token field is only computed (detokenized - from the associated top token id) if detokenize=True + """ + Create formatted PromptLogprobs objects from the raw + EngineCore outputs after pythonizing + detokenizing. + + NOTE: we detokenize the logprobs *non-incrementally* + for simplicity and performance of the implementation. Args: - prompt_logprob_values: num_chunk_tokens x num_prompt_logprobs np array - of top token log probabilities - prompt_logprob_token_ids: num_chunk_tokens x num_prompt_logprobs np - array of top token ids - detokenize: Logprob.decoded_token is computed if True, otherwise None - + token_ids: Tensor of tok ids of shape [prompt_len, topk] + logprobs: Tensor of logprobs of shape [prompt_len, topk] Returns: - Prompt logprobs compute in this step, Pythonized and possibly - detokenized + PromptLogprobs: List[Dict[int, Logprob]] """ - logprob_cnt = self.max_request_prompt_logprobs - prompt_logprobs: List[Optional[Dict[int, Logprob]]] = [ - self._pythonize_sequence_position(plp_tok_values, - plp_tok_token_ids, detokenize) - for plp_tok_values, plp_tok_token_ids in zip( - # Slice out top prompt logprobs - prompt_logprob_values[:, 0:logprob_cnt], - prompt_logprob_token_ids[:, 0:logprob_cnt]) - ] - if not self.request_prompt_logprobs: - # Ensure that None is the first prompt logprob - prompt_logprobs = cast(List[Optional[Dict[int, Logprob]]], - [None]) + prompt_logprobs - assert self.request_prompt_logprobs is not None - self.request_prompt_logprobs.extend(prompt_logprobs) - return prompt_logprobs + + pass + # prompt_logprob_token_ids_lst = token_ids.tolist() + # decoded_tokens + # decoded_tokens = self.token_ids.con + # pass + + # for + + # logprob_cnt = self.max_request_prompt_logprobs + # prompt_logprobs: List[Optional[Dict[int, Logprob]]] = [ + # self._pythonize_sequence_position(plp_tok_values, + # plp_tok_token_ids, detokenize) + # for plp_tok_values, plp_tok_token_ids in zip( + # # Slice out top prompt logprobs + # prompt_logprob_values[:, 0:logprob_cnt], + # prompt_logprob_token_ids[:, 0:logprob_cnt]) + # ] + # if not self.request_prompt_logprobs: + # # Ensure that None is the first prompt logprob + # prompt_logprobs = cast(List[Optional[Dict[int, Logprob]]], + # [None]) + prompt_logprobs + # assert self.request_prompt_logprobs is not None + # self.request_prompt_logprobs.extend(prompt_logprobs) + # return prompt_logprobs def add_tokens( self, @@ -245,8 +228,8 @@ def add_tokens( 
stop_reason: Optional[Union[int, str, None]], new_logprobs_token_ids: Optional[List[torch.Tensor]], new_logprobs: Optional[List[torch.Tensor]], - prompt_logprobs_token_ids: Optional[torch.Tensor], - prompt_logprobs: Optional[torch.Tensor], + new_prompt_logprobs_token_ids: Optional[torch.Tensor], + new_prompt_logprobs: Optional[torch.Tensor], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: @@ -298,24 +281,28 @@ def add_tokens( stop_reason = stop_str # 3) Make Sample Logprobs. + logprobs = None if new_logprobs: - sample_logprobs = self._make_sample_logprobs( + assert new_logprobs_token_ids is not None + assert self.logprobs is not None + logprobs = self._make_sample_logprobs( sampled_token_ids=new_token_ids, logprobs_token_ids_lst=new_logprobs_token_ids, - logprobs=new_logprobs) - self.logprobs.append(sample_logprobs) - # TODO: update cumulative logprob. + logprobs_lst=new_logprobs) + self.logprobs.append(logprobs) + # TODO(rob): update cumulative logprob. # self.cumulative_logprob - # 4) Pythonize & detokenizer prompt logprobs. - if prompt_logprobs: + # 4) Make Prompt Logprobs. + prompt_logprobs = None + if new_prompt_logprobs: # EngineCore does not stream out partial prefill, # so all prompt logprobs come in one step. - assert len(self.prompt_logprobs) == 0 - assert prompt_logprobs_token_ids is not None + assert (self.prompt_logprobs is not None + and len(self.prompt_logprobs) == 0) + assert new_prompt_logprobs_token_ids is not None self.prompt_logprobs = self._make_prompt_logprobs( - prompt_logprobs_token_ids, - prompt_logprobs) + new_prompt_logprobs_token_ids, new_prompt_logprobs) # 5) Makes the RequestOutput object with the new text. finished = bool(finish_reason) @@ -323,12 +310,11 @@ def add_tokens( and not finished: return None - # Return just newly created items if DELTA. delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - logprobs = sample_logprobs if delta else self.logprobs - prompt_logprobs = sample_logprobs if delta else self.prompt_logprobs + logprobs = logprobs if delta else self.logprobs + prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs cumulative_logprob = self.cumulative_logprob request_output = RequestOutput.new( @@ -433,10 +419,11 @@ def step( new_token_ids=engine_core_output.new_token_ids, finish_reason=engine_core_output.finish_reason, stop_reason=engine_core_output.stop_reason, - new_sample_logprobs=engine_core_output.logprobs, + new_logprobs=engine_core_output.logprobs, + new_logprobs_token_ids=engine_core_output.logprobs_token_ids, new_prompt_logprobs=engine_core_output.prompt_logprobs, - new_prompt_logprob_token_ids=engine_core_output. - prompt_logprobs_token_ids, + new_prompt_logprobs_token_ids=( + engine_core_output.prompt_logprobs_token_ids), ) if request_output is not None: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index c2aec2ce486b8..c71eca4bfe7a8 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -62,13 +62,13 @@ def _validate_logprobs( if (params.logprobs and params.logprobs > max_logprobs): raise ValueError( f"Requested sample logprobs of {params.logprobs}, " - f"which is greated than max allowed: {max_logprobs}") + f"which is greater than max allowed: {max_logprobs}") # Validate prompt logprobs. 
if (params.prompt_logprobs and params.prompt_logprobs > max_logprobs): raise ValueError( f"Requested prompt logprobs of {params.prompt_logprobs}, " - f"which is greated than max allowed: {max_logprobs}") + f"which is greater than max allowed: {max_logprobs}") def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: if lora_request is not None and not self.lora_config: diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 312015d04ed33..f4783ae366ef0 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,8 +1,5 @@ import enum -from typing import TYPE_CHECKING, List, Optional, Tuple, Union - -import numpy as np -import numpy.typing as npt +from typing import TYPE_CHECKING, List, Optional, Union from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest @@ -49,33 +46,6 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() - - # If sample logprobs are enabled, the number of sample logprobs cannot - # be anticipated in advance (because the LLM is partially responsible - # for deciding when the completion is finished.) So, - # build a list of (logprobs,logprob_token_ids) tuples for each generated - # sequence position; logprobs and logprob_token_ids are both - # 1 x num_logprobs_at_offset np arrays, - # where num_logprobs_at_offset is the number of logprobs at a - # particular offset in the generated sequence. This has overheads - # compared to a single big NDArray, but should be okay because - # subsequent logprobs pythonization steps only - # aggregate along rows, not along columns. - # TODO: an alternative could be to preallocate a - # self.max_tokens x self.max_logprobs NDArray, but - # this was not employed because the array could be very large for large - # context windows, even if the completion was very short. - self.logprobs: Optional[List[Tuple[npt.NDArray, npt.NDArray]]] = ( - None if self.request_sample_logprobs is None else []) - # The number of prompt logprobs is known is advance, so preallocate an - # NDArray - self.prompt_logprobs: Optional[npt.NDArray] = ( - None if self.request_prompt_logprobs is None else np.empty( - (self.num_prompt_tokens, self.request_prompt_logprobs))) - self.prompt_logprob_token_ids: Optional[npt.NDArray] = ( - None if self.request_prompt_logprobs is None else np.empty( - (self.num_prompt_tokens, self.request_prompt_logprobs), - dtype=np.int32)) self.num_computed_tokens = 0 # Multi-modal input metadata. diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 1ec76b2d6bad9..3b286d74355e9 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,7 +1,6 @@ from dataclasses import dataclass from typing import Dict, List -import numpy import torch diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index abb41e960a5f1..39868a019cdab 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -4,9 +4,8 @@ import torch import torch.nn as nn -from vllm.v1.outputs import SamplerOutput, PromptLogprobsOutput -from vllm.v1.sample.metadata import (LogitsProcessMetadata, SamplingMetadata, - PromptLogprobsMetadata) +from vllm.v1.outputs import SamplerOutput +from vllm.v1.sample.metadata import LogitsProcessMetadata, SamplingMetadata _SAMPLING_EPS = 1e-5 @@ -28,7 +27,7 @@ def forward( # Compute the logprobs if requested. # NOTE: CPU-GPU synchronization happens here. 
logprob_token_ids, logprobs = self._compute_logprobs( - logit=logits, + logits=logits, max_num_logprobs=sampling_metadata.max_num_logprobs, sampled_token_ids=sampled) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 22ac90b0cae45..5f01eaaaf6443 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,6 +1,7 @@ -import torch import pickle from typing import Any + +import torch from msgspec import msgpack CUSTOM_TYPE_CODE_PICKLE = 1 diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bc8c9474737b4..7ae2e378de6e0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -4,12 +4,13 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Set import numpy as np +import numpy.typing as npt import torch from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType -from vllm.v1.sample.metadata import (LogitsProcessMetadata, SamplingMetadata, - PromptLogprobsMetadata) +from vllm.v1.sample.metadata import (LogitsProcessMetadata, + PromptLogprobsMetadata, SamplingMetadata) if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange @@ -273,13 +274,16 @@ def make_sampling_metadata( def make_prompt_logprobs_metadata( self, - partial_req_ids: List[int], - req_indices: np.ndarray, + partial_req_ids: List[str], + req_indices: npt.NDArray, ) -> Optional[PromptLogprobsMetadata]: if not self.max_num_prompt_logprobs: return None + # Precompute the indicies. + all_indicies = np.arange(req_indices.shape[0]) + # NOTE(rob): we should avoid loops like this in model runner, # but this ONLY loops over requests that are currently in # prefill phase AND need prompt lps. @@ -293,8 +297,8 @@ def make_prompt_logprobs_metadata( ): req_idx = self.req_id_to_index[req_id] - # Make the logits mask for the request prefills. - mask = req_indices[req_indices == req_idx].tolist() + # Make the logits mask for this request's prefill. + mask = all_indicies[req_indices == req_idx].tolist() if req_id not in partial_req_ids: # Remove the sample token if there is one. 
mask = mask[:-1] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0fd75c58c0bd3..6770ac1451623 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast import numpy as np +import numpy.typing as npt import torch import torch.distributed import torch.nn as nn @@ -20,8 +21,8 @@ from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient -from vllm.v1.outputs import ModelRunnerOutput, SamplerOutput -from vllm.v1.sample.metadata import SamplingMetadata, PromptLogprobsMetadata +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.sample.metadata import PromptLogprobsMetadata, SamplingMetadata from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -378,7 +379,6 @@ def _prepare_inputs( sampling_metadata, prompt_logprobs_metadata = self._prepare_sampling( scheduler_output=scheduler_output, sample_indices=query_start_loc[1:] - 1, - num_scheduled_tokens=num_scheduled_tokens, req_indices=req_indices, ) @@ -388,8 +388,7 @@ def _prepare_sampling( self, scheduler_output: "SchedulerOutput", sample_indices: torch.Tensor, - num_scheduled_tokens: np.array, - req_indices: np.ndarray, + req_indices: npt.NDArray, ) -> Tuple[SamplingMetadata, Optional[PromptLogprobsMetadata]]: skip_copy = True if (scheduler_output.finished_req_ids @@ -402,9 +401,9 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata( skip_copy, sample_indices) - # Create the prompt logprobs metdata. + # Create the prompt logprobs metadata. prompt_lps_metadata = self.input_batch.make_prompt_logprobs_metadata( - num_scheduled_tokens, req_indices) + scheduler_output.partial_req_ids, req_indices) return sampling_metadata, prompt_lps_metadata @@ -559,8 +558,8 @@ def execute_model( # so prioritize simplicity. prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} if prompt_logprobs_metadata: - for req_id, mask, metadata, num_logprobs in prompt_logprobs_metadata.zipped( - ): + for (req_id, mask, metadata, + num_logprobs) in prompt_logprobs_metadata.zipped(): # TODO: make prompt lp metadata here? # Compute logits. From 868e653cc37f90473186ad768e0e9add99ace839 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:13:54 +0000 Subject: [PATCH 214/293] updated --- vllm/v1/core/scheduler.py | 15 ++- vllm/v1/engine/__init__.py | 6 +- vllm/v1/engine/detokenizer.py | 145 ++++++++++++++++------------- vllm/v1/worker/gpu_model_runner.py | 1 + 4 files changed, 93 insertions(+), 74 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index a45756d310945..aa72047a91cfd 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -3,6 +3,8 @@ from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, Tuple, Union) +import torch + from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.multimodal import MultiModalKwargs @@ -432,13 +434,16 @@ def update_from_output( stopped = self._check_stop(request) # Extract sample logprobs if needed. - # TODO(rob): does it make sense to pythonize here? 
- logprobs_token_ids = (logprobs_token_ids_cpu[req_index] - if logprobs_token_ids_cpu else None) - logprobs = logprobs_cpu[req_index] if logprobs_cpu else None + logprobs_token_ids: List[torch.Tensor] = [] + logprobs: List[torch.Tensor] = [] + if request.sampling_params.logprobs: + assert logprobs_token_ids_cpu is not None + assert logprobs_cpu is not None + # Here we assume there is 1 generated token per step. + logprobs_token_ids = [logprobs_token_ids_cpu[req_index]] + logprobs = logprobs_cpu[req_index] # Extract prompt logprobs for this req if needed. - # TODO(rob): does it make sense to pythonize here? # FIXME(rob): handle partial request. Currently we throw away # the prompt logprobs for the partial request. prompt_logprobs_token_ids, prompt_logprobs = ( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index af3cad1965c25..1e4b9e4ae7114 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -55,11 +55,11 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] - finished: bool - logprobs: Optional[torch.Tensor] - logprobs_token_ids: Optional[torch.Tensor] + logprobs: List[torch.Tensor] + logprobs_token_ids: List[torch.Tensor] prompt_logprobs: Optional[torch.Tensor] prompt_logprobs_token_ids: Optional[torch.Tensor] + finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index aa43c6e9f2bbb..662b545ebb93d 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -48,7 +48,7 @@ class IncrementalDetokenizer: # Logprobs for this request logprobs: Optional[SampleLogprobs] prompt_logprobs: Optional[PromptLogprobs] - cumulative_logprob: float + cumulative_logprob: Optional[float] num_logprobs: int num_prompt_logprobs: int @@ -101,19 +101,20 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, + cumulative_logprob=(0. if request.logprobs else None), logprobs=([] if request.logprobs else None), - prompt_logprobs=([] if request.prompt_logprobs else None), - cumulative_logprob=0., + # NOTE(rob): prompt logprobs of first token is always None. + prompt_logprobs=([None] if request.prompt_logprobs else None), num_logprobs=request.logprobs, num_prompt_logprobs=request.prompt_logprobs, ) - def _make_sample_logprobs( + def _update_sample_logprobs( self, sampled_token_ids: List[int], logprobs_token_ids_lst: List[torch.Tensor], logprobs_lst: List[torch.Tensor], - ) -> SampleLogprobs: + ) -> Optional[SampleLogprobs]: """ Create formatted SampleLogprobs objects from the raw EngineCore outputs after pythonizing + detokenizing. @@ -128,14 +129,20 @@ def _make_sample_logprobs( logprobs_lst: List of tensors of logprobs of shape [topk+1] for to sampled + topk token ids Returns: - SampleLogprobs: List[Dict[str, Logprob]] + SampleLogprobs: List[Dict[str, Logprob]]: New only. """ - # NOTE(rob): the lists are of length > 1 if a single step - # of EngineCore generates > 1 token (e.g. spec decoding). - assert len(sampled_token_ids) == len(logprobs_token_ids_lst) - assert len(sampled_token_ids) == len(logprobs_lst) - output_list: SampleLogprobs = [] + if self.num_logprobs == 0: + assert (len(logprobs_token_ids_lst) == 0 + and len(logprobs_lst) == 0) + return None + assert self.logprobs is not None + + # NOTE(rob): the lists are of length > 1 if EngineCore + # generates > 1 token per step (e.g. in spec decoding). 
+ num_new_tokens = len(sampled_token_ids) + assert num_new_tokens == len(logprobs_token_ids_lst) + assert num_new_tokens == len(logprobs_lst) for sampled_token_id, logprobs, logprobs_token_ids in zip( sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): @@ -162,7 +169,7 @@ def _make_sample_logprobs( for idx in range(self.num_logprobs) } - # Make the sampled token Logprob object if not in topk. + # Make the sampled Logprob object if not in topk. if sampled_token_id not in topk_logprobs_dict: # TODO(rob): do we need to plumb up the rank for # the sample Logprob? It is not used in the @@ -173,15 +180,18 @@ def _make_sample_logprobs( rank=None, decoded_token=token) - output_list.append(topk_logprobs_dict) + # Update logprobs for this sequence position. + self.logprobs.append(topk_logprobs_dict) + # FIXME(rob): update cumulative logprob. - return output_list + # Return just the newly generated sample logprobs. + return self.logprobs[-num_new_tokens:] - def _make_prompt_logprobs( + def _update_prompt_logprobs( self, - token_ids: torch.Tensor, - logprobs: torch.Tensor, - ) -> PromptLogprobs: + logprobs_token_ids: Optional[torch.Tensor], + logprobs: Optional[torch.Tensor], + ) -> Optional[PromptLogprobs]: """ Create formatted PromptLogprobs objects from the raw EngineCore outputs after pythonizing + detokenizing. @@ -196,38 +206,53 @@ def _make_prompt_logprobs( PromptLogprobs: List[Dict[int, Logprob]] """ - pass - # prompt_logprob_token_ids_lst = token_ids.tolist() - # decoded_tokens - # decoded_tokens = self.token_ids.con - # pass - - # for - - # logprob_cnt = self.max_request_prompt_logprobs - # prompt_logprobs: List[Optional[Dict[int, Logprob]]] = [ - # self._pythonize_sequence_position(plp_tok_values, - # plp_tok_token_ids, detokenize) - # for plp_tok_values, plp_tok_token_ids in zip( - # # Slice out top prompt logprobs - # prompt_logprob_values[:, 0:logprob_cnt], - # prompt_logprob_token_ids[:, 0:logprob_cnt]) - # ] - # if not self.request_prompt_logprobs: - # # Ensure that None is the first prompt logprob - # prompt_logprobs = cast(List[Optional[Dict[int, Logprob]]], - # [None]) + prompt_logprobs - # assert self.request_prompt_logprobs is not None - # self.request_prompt_logprobs.extend(prompt_logprobs) - # return prompt_logprobs + # Skip if this request is not using logprobs. + if self.num_prompt_logprobs == 0: + return None + + # Skip if last step did not generate prompt lps (decode). + if logprobs_token_ids is None: + return None + assert logprobs is not None + + # Since EngineCore does not stream partial requests, + # Detokenizer gets all the prompt logprobs at once, thus + # self.prompt_logprobs=[None]. + assert (self.prompt_logprobs is not None + and len(self.prompt_logprobs == 1)) + + # Decode the tokens (*non-incrementally). + # Flattened: [prompt_len, num_logprobs] -> + # [prompt_len * num_logprobs] + decoded_tokens = self.tokenizer.batch_decode( + logprobs_token_ids.reshape(-1, 1)) + + # Make Logprob for each prompt token. + num_tokens, num_logprobs = logprobs.shape + assert num_logprobs == self.num_prompt_logprobs + for token_idx in range(num_tokens): + topk_logprobs = logprobs[token_idx].tolist() + topk_token_ids = logprobs_token_ids[token_idx].tolist() + # NOTE: Sampler uses torch.topk(sorted=True), so idx=rank. 
+ self.prompt_logprobs.append({ + topk_token_ids[idx]: Logprob( + logprob=topk_logprobs[idx], + rank=idx, + decoded_token=decoded_tokens[num_tokens * num_logprobs + + idx], + ) + for idx in range(num_logprobs) + }) + + return self.prompt_logprobs def add_tokens( self, new_token_ids: List[int], finish_reason: Optional[str], stop_reason: Optional[Union[int, str, None]], - new_logprobs_token_ids: Optional[List[torch.Tensor]], - new_logprobs: Optional[List[torch.Tensor]], + new_logprobs_token_ids: List[torch.Tensor], + new_logprobs: List[torch.Tensor], new_prompt_logprobs_token_ids: Optional[torch.Tensor], new_prompt_logprobs: Optional[torch.Tensor], ) -> Optional[RequestOutput]: @@ -281,28 +306,17 @@ def add_tokens( stop_reason = stop_str # 3) Make Sample Logprobs. - logprobs = None - if new_logprobs: - assert new_logprobs_token_ids is not None - assert self.logprobs is not None - logprobs = self._make_sample_logprobs( - sampled_token_ids=new_token_ids, - logprobs_token_ids_lst=new_logprobs_token_ids, - logprobs_lst=new_logprobs) - self.logprobs.append(logprobs) - # TODO(rob): update cumulative logprob. - # self.cumulative_logprob + logprobs = self._update_sample_logprobs( + sampled_token_ids=new_token_ids, + logprobs_token_ids_lst=new_logprobs_token_ids, + logprobs_lst=new_logprobs, + ) # 4) Make Prompt Logprobs. - prompt_logprobs = None - if new_prompt_logprobs: - # EngineCore does not stream out partial prefill, - # so all prompt logprobs come in one step. - assert (self.prompt_logprobs is not None - and len(self.prompt_logprobs) == 0) - assert new_prompt_logprobs_token_ids is not None - self.prompt_logprobs = self._make_prompt_logprobs( - new_prompt_logprobs_token_ids, new_prompt_logprobs) + prompt_logprobs = self._update_prompt_logprobs( + logprobs_token_ids=new_prompt_logprobs_token_ids, + logprobs=new_prompt_logprobs, + ) # 5) Makes the RequestOutput object with the new text. finished = bool(finish_reason) @@ -315,7 +329,6 @@ def add_tokens( token_ids = new_token_ids if delta else self.output_token_ids logprobs = logprobs if delta else self.logprobs prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs - cumulative_logprob = self.cumulative_logprob request_output = RequestOutput.new( self.request_id, @@ -325,7 +338,7 @@ def add_tokens( token_ids, logprobs, prompt_logprobs, - cumulative_logprob, + self.cumulative_logprob, finished, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6770ac1451623..67f0008636502 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -378,6 +378,7 @@ def _prepare_inputs( # Make Sampling and Prompt Logprobs Metadata. sampling_metadata, prompt_logprobs_metadata = self._prepare_sampling( scheduler_output=scheduler_output, + # Here we assume there is one generated token per step. 
sample_indices=query_start_loc[1:] - 1, req_indices=req_indices, ) From 62b8360261778a6f13b8fd19a6cf6800497e20a4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:15:32 +0000 Subject: [PATCH 215/293] update --- vllm/v1/engine/detokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 662b545ebb93d..049b734930b84 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -233,7 +233,7 @@ def _update_prompt_logprobs( for token_idx in range(num_tokens): topk_logprobs = logprobs[token_idx].tolist() topk_token_ids = logprobs_token_ids[token_idx].tolist() - # NOTE: Sampler uses torch.topk(sorted=True), so idx=rank. + # Sampler uses torch.topk(sorted=True), so idx=rank. self.prompt_logprobs.append({ topk_token_ids[idx]: Logprob( logprob=topk_logprobs[idx], From 7fe4d859acf907c0320e4a5263ed3ef5cde0692a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:15:52 +0000 Subject: [PATCH 216/293] update --- vllm/v1/engine/detokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 049b734930b84..61057792bd930 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -249,12 +249,12 @@ def _update_prompt_logprobs( def add_tokens( self, new_token_ids: List[int], - finish_reason: Optional[str], - stop_reason: Optional[Union[int, str, None]], new_logprobs_token_ids: List[torch.Tensor], new_logprobs: List[torch.Tensor], new_prompt_logprobs_token_ids: Optional[torch.Tensor], new_prompt_logprobs: Optional[torch.Tensor], + finish_reason: Optional[str], + stop_reason: Optional[Union[int, str, None]], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: From 92a27aabc8152a5d4f51ac9b6ee6de372199c583 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:16:20 +0000 Subject: [PATCH 217/293] updated --- vllm/v1/engine/detokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 61057792bd930..577bb4e5e9cef 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -430,13 +430,13 @@ def step( # Detokenize and update state. request_output = detokenizer.add_tokens( new_token_ids=engine_core_output.new_token_ids, - finish_reason=engine_core_output.finish_reason, - stop_reason=engine_core_output.stop_reason, new_logprobs=engine_core_output.logprobs, new_logprobs_token_ids=engine_core_output.logprobs_token_ids, new_prompt_logprobs=engine_core_output.prompt_logprobs, new_prompt_logprobs_token_ids=( engine_core_output.prompt_logprobs_token_ids), + finish_reason=engine_core_output.finish_reason, + stop_reason=engine_core_output.stop_reason, ) if request_output is not None: From e279409e0ec1f350578be808f1afeacbc16ab85e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:22:10 +0000 Subject: [PATCH 218/293] update indexing --- vllm/v1/sample/sampler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 39868a019cdab..b10756dd23681 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -70,10 +70,9 @@ def _compute_logprobs( # Concatenate with the sampled token_id if provided. if sampled_token_ids: - # TODO(rob): check if the concat is right. 
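
# A small self-contained illustration of the indexing question raised in this
# TODO: for a [num_seqs, vocab] tensor, fancy indexing with `[:, ids]` selects
# the listed ids for every row, while gather picks one entry per row. Names
# and shapes are hypothetical.
import torch

logprobs = torch.log_softmax(torch.randn(4, 32), dim=-1)  # [num_seqs, vocab]
ids = torch.randint(0, 32, (4, 1))                        # [num_seqs, 1]

print(logprobs[:, ids].shape)          # torch.Size([4, 4, 1])
print(logprobs.gather(-1, ids).shape)  # torch.Size([4, 1])
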
- # TODO(rob): we need to return the rank of the sampled token - # to be compatible with the OAI spec. - sampled_logprobs = logprobs[sampled_token_ids] + # TODO(rob): check if the indexing / concatting is right + # TODO(rob): do we need to return the rank of the sampled? + sampled_logprobs = logprobs[:, sampled_token_ids] topk_indices = torch.cat([sampled_token_ids, topk_indices]) topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) From bc3942c8f67dc8421cfc44d71caf4e4d7d1888a4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:24:54 +0000 Subject: [PATCH 219/293] reduce changeg --- vllm/v1/sample/sampler.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index b10756dd23681..1648bd9123e4c 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -17,7 +17,9 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - logits = self._process_logits( + logits = self.apply_temperature( + logits, sampling_metadata.logits_process_metadata.temperature) + logits = self.apply_top_k_top_p( logits, sampling_metadata.logits_process_metadata) probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) @@ -45,7 +47,9 @@ def compute_prompt_logprobs( logits_process_metadata: LogitsProcessMetadata, num_logprobs: int, ) -> Tuple[torch.Tensor, torch.Tensor]: - logits = self._process_logits(logits, logits_process_metadata) + logits = self.apply_temperature(logits, + logits_process_metadata.temperature) + logits = self.apply_top_k_top_p(logits, logits_process_metadata) # NOTE: CPU-GPU synchronization happens here. logprob_token_ids, logprobs = self._compute_logprobs( @@ -80,17 +84,7 @@ def _compute_logprobs( else: return None, None - def _process_logits( - self, - logits: torch.Tensor, - logits_process_metadata: LogitsProcessMetadata, - ) -> torch.Tensor: - logits = self._apply_temperature(logits, - logits_process_metadata.temperature) - logits = self._apply_top_k_top_p(logits, logits_process_metadata) - return logits - - def _apply_temperature( + def apply_temperature( self, logits: torch.Tensor, temp: torch.Tensor, @@ -103,7 +97,7 @@ def _apply_temperature( logits.div_(temp.unsqueeze(dim=1)) return logits - def _apply_top_k_top_p( + def apply_top_k_top_p( self, logits: torch.Tensor, logits_process_metadata: LogitsProcessMetadata, From b5647c3d187d4c9450986f6b291563089359600b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:26:47 +0000 Subject: [PATCH 220/293] reduce cruft --- vllm/v1/sample/sampler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 1648bd9123e4c..0cfd1b36161ac 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -154,9 +154,12 @@ def sample( greedy_sampled = self.greedy_sample(probs) random_sampled = self.random_sample(probs, sampling_metadata.generators) - temperature = sampling_metadata.logits_process_metadata.temperature - sampled = torch.where(temperature < _SAMPLING_EPS, greedy_sampled, - random_sampled) + sampled = torch.where( + sampling_metadata.logits_process_metadata.temperature < + _SAMPLING_EPS, + greedy_sampled, + random_sampled, + ) return sampled From 0db5db01375782f92bc073b49561c73a428d73a6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 22:27:44 +0000 Subject: [PATCH 221/293] reduce cruft --- 
vllm/v1/serial_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 5f01eaaaf6443..666fca92bae92 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,7 +1,6 @@ import pickle -from typing import Any - import torch +from typing import Any from msgspec import msgpack CUSTOM_TYPE_CODE_PICKLE = 1 @@ -9,10 +8,10 @@ class PickleEncoder: - def encode(self, obj): + def encode(self, obj: Any): return pickle.dumps(obj) - def decode(self, data): + def decode(self, data: Any): return pickle.loads(data) From ff7d7d26ca908e217da0e8ea6411c3e3b77dc371 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:01:12 +0000 Subject: [PATCH 222/293] updated --- vllm/v1/engine/detokenizer.py | 68 ++++++++++++++++-------------- vllm/v1/serial_utils.py | 3 +- vllm/v1/worker/gpu_input_batch.py | 18 ++++---- vllm/v1/worker/gpu_model_runner.py | 8 ++-- 4 files changed, 50 insertions(+), 47 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 577bb4e5e9cef..5344a80e319ef 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -50,7 +50,6 @@ class IncrementalDetokenizer: prompt_logprobs: Optional[PromptLogprobs] cumulative_logprob: Optional[float] num_logprobs: int - num_prompt_logprobs: int # Accounting for stop string buffering stop_buffer_length: int @@ -103,10 +102,8 @@ def from_new_request( stop_buffer_length=stop_buffer_length, cumulative_logprob=(0. if request.logprobs else None), logprobs=([] if request.logprobs else None), - # NOTE(rob): prompt logprobs of first token is always None. - prompt_logprobs=([None] if request.prompt_logprobs else None), + prompt_logprobs=None, num_logprobs=request.logprobs, - num_prompt_logprobs=request.prompt_logprobs, ) def _update_sample_logprobs( @@ -206,46 +203,53 @@ def _update_prompt_logprobs( PromptLogprobs: List[Dict[int, Logprob]] """ - # Skip if this request is not using logprobs. - if self.num_prompt_logprobs == 0: - return None - - # Skip if last step did not generate prompt lps (decode). if logprobs_token_ids is None: return None assert logprobs is not None - # Since EngineCore does not stream partial requests, - # Detokenizer gets all the prompt logprobs at once, thus - # self.prompt_logprobs=[None]. - assert (self.prompt_logprobs is not None - and len(self.prompt_logprobs == 1)) + # EngineCore does not stream until entire prompt complete, + # so Detokenizer should get all prompt lps at once. + assert self.prompt_logprobs is None - # Decode the tokens (*non-incrementally). - # Flattened: [prompt_len, num_logprobs] -> - # [prompt_len * num_logprobs] + # Detokenize non-incrementally. + # [num_tok, num_lps] -> [num_tok * num_lps] decoded_tokens = self.tokenizer.batch_decode( logprobs_token_ids.reshape(-1, 1)) - # Make Logprob for each prompt token. + # Make Logprob for prompt token. + # NOTE(rob): the first tok has None. num_tokens, num_logprobs = logprobs.shape - assert num_logprobs == self.num_prompt_logprobs - for token_idx in range(num_tokens): - topk_logprobs = logprobs[token_idx].tolist() - topk_token_ids = logprobs_token_ids[token_idx].tolist() - # Sampler uses torch.topk(sorted=True), so idx=rank. 
- self.prompt_logprobs.append({ - topk_token_ids[idx]: Logprob( - logprob=topk_logprobs[idx], - rank=idx, - decoded_token=decoded_tokens[num_tokens * num_logprobs + - idx], - ) - for idx in range(num_logprobs) - }) + self.prompt_logprobs = [None] + [ + self._make_pos_logprob_dict( + logprobs[tok_idx].tolist(), + logprobs_token_ids[tok_idx].tolist(), + decoded_tokens[tok_idx * num_logprobs:], + num_logprobs, + ) for tok_idx in range(num_tokens) + ] return self.prompt_logprobs + @staticmethod + def _make_pos_logprob_dict( + logprobs: List[float], + token_ids: List[int], + decoded_tokens: List[str], + num_logprobs: int, + ) -> Dict[int, Logprob]: + """Make a Logprob dictionary for a position in the sequence.""" + + # Sampler uses torch.topk() which sorts so the + # index in lists is equivalent to rank. + return { + token_ids[idx]: Logprob( + logprob=logprobs[idx], + rank=idx, + decoded_token=decoded_tokens[idx], + ) + for idx in range(num_logprobs) + } + def add_tokens( self, new_token_ids: List[int], diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 666fca92bae92..813153a56ef68 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,6 +1,7 @@ import pickle -import torch from typing import Any + +import torch from msgspec import msgpack CUSTOM_TYPE_CODE_PICKLE = 1 diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 7ae2e378de6e0..1eaabe10a5243 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -278,23 +278,22 @@ def make_prompt_logprobs_metadata( req_indices: npt.NDArray, ) -> Optional[PromptLogprobsMetadata]: - if not self.max_num_prompt_logprobs: + if self.no_prompt_logprob: return None # Precompute the indicies. all_indicies = np.arange(req_indices.shape[0]) - # NOTE(rob): we should avoid loops like this in model runner, + # NOTE(rob): we should avoid loops like this in ModelRunner, # but this ONLY loops over requests that are currently in # prefill phase AND need prompt lps. + # Should we move this to _update_states or execute_model() + # to avoid another loop? req_ids = [] masks = [] logits_process_metadatas = [] num_prompt_logprobs = [] - - # TODO(rob): should we move this to _update_states? - for req_id, req_num_prompt_logprobs in self.num_prompt_logprobs.items( - ): + for req_id in self.num_prompt_logprobs: req_idx = self.req_id_to_index[req_id] # Make the logits mask for this request's prefill. @@ -313,7 +312,7 @@ def make_prompt_logprobs_metadata( req_ids.append(req_id) masks.append(mask) - num_prompt_logprobs.append(req_num_prompt_logprobs) + num_prompt_logprobs.append(self.num_prompt_logprobs[req_id]) logits_process_metadatas.append( LogitsProcessMetadata(temperature=temperature, top_p=top_p, @@ -352,6 +351,5 @@ def max_num_logprobs(self) -> int: return max(self.num_logprobs.values()) if self.num_logprobs else 0 @property - def max_num_prompt_logprobs(self) -> int: - return (max(self.num_prompt_logprobs.values()) - if self.num_prompt_logprobs else 0) + def no_prompt_logprob(self) -> bool: + return len(self.num_prompt_logprobs) == 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 67f0008636502..8df1e88da9c9b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -553,10 +553,10 @@ def execute_model( ) # Compute prompt logprobs if needed. - # NOTE(rob): compute prompt logprobs for each req separately, - # which is suboptimal. 
However, prompt logprobs are rare (used - # by lm-eval-harness) and we have few prefill per batch, - # so prioritize simplicity. + # NOTE(rob): for clean code, compute prompt logprobs for each req + # separately, which is suboptimal. However, prompt logprobs are rare + # (used mostly by lm-eval-harness) and we have few prefill per batch, + # so we prioritize simplicity. prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} if prompt_logprobs_metadata: for (req_id, mask, metadata, From 8aa8baa636e7bb48ddc3afe8ce00cc734aab9c07 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:06:35 +0000 Subject: [PATCH 223/293] update comment --- vllm/v1/engine/detokenizer.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5344a80e319ef..24b362f530512 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -138,8 +138,6 @@ def _update_sample_logprobs( # NOTE(rob): the lists are of length > 1 if EngineCore # generates > 1 token per step (e.g. in spec decoding). num_new_tokens = len(sampled_token_ids) - assert num_new_tokens == len(logprobs_token_ids_lst) - assert num_new_tokens == len(logprobs_lst) for sampled_token_id, logprobs, logprobs_token_ids in zip( sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): @@ -153,33 +151,26 @@ def _update_sample_logprobs( topk_token_ids = logprobs_token_ids[1:].tolist() topk_logprobs = logprobs[1:].tolist() - # Make the Logprob objects. + # Detokenize (non-incrementally). decoded_tokens = self.tokenizer.batch_decode( topk_token_ids.reshape(-1, 1)) - # Sampler uses torch.topk() which sorts, so idx=rank. - topk_logprobs_dict = { - topk_token_ids[idx]: Logprob( - logprob=topk_logprobs[idx], - rank=idx, - decoded_token=decoded_tokens[idx], - ) - for idx in range(self.num_logprobs) - } - - # Make the sampled Logprob object if not in topk. - if sampled_token_id not in topk_logprobs_dict: - # TODO(rob): do we need to plumb up the rank for - # the sample Logprob? It is not used in the - # Chat Completions API for instance. + + # Make the Logprob objects for each position. + pos_logprobs_dict = self._make_pos_logprob_dict( + topk_logprobs, topk_token_ids, decoded_tokens, + self.num_logprobs) + + # Add the sampled Logprob if it was not in topk + if sampled_token_id not in pos_logprobs_dict: token = self.tokenizer.decode(sampled_token_id) - topk_logprobs_dict[sampled_token_id] = Logprob( + pos_logprobs_dict[sampled_token_id] = Logprob( logprob=sampled_token_logprob, - rank=None, + rank=None, # TODO: is this needed? decoded_token=token) # Update logprobs for this sequence position. - self.logprobs.append(topk_logprobs_dict) - # FIXME(rob): update cumulative logprob. + self.logprobs.append(pos_logprobs_dict) + self.cumulative_logprob += sampled_token_logprob # Return just the newly generated sample logprobs. 
return self.logprobs[-num_new_tokens:] From 527228da14910826a0cde5c2d8f824a808dfa54c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:06:47 +0000 Subject: [PATCH 224/293] format --- vllm/v1/engine/detokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 24b362f530512..bfb92a3b31a55 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -165,7 +165,7 @@ def _update_sample_logprobs( token = self.tokenizer.decode(sampled_token_id) pos_logprobs_dict[sampled_token_id] = Logprob( logprob=sampled_token_logprob, - rank=None, # TODO: is this needed? + rank=None, # TODO: is this needed? decoded_token=token) # Update logprobs for this sequence position. From f5d0b57dd15b898e876b40fea4bb9363ed17a5c6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:08:14 +0000 Subject: [PATCH 225/293] reduce length of comments --- vllm/v1/engine/detokenizer.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index bfb92a3b31a55..12e5a2d8571b8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -113,12 +113,6 @@ def _update_sample_logprobs( logprobs_lst: List[torch.Tensor], ) -> Optional[SampleLogprobs]: """ - Create formatted SampleLogprobs objects from the raw - EngineCore outputs after pythonizing + detokenizing. - - NOTE: we detokenize the logprobs *non-incrementally* - for simplicity and performance of the implementation. - Args: sampled_token_ids: List of new sampled tokens logprobs_token_ids_lst: List of tensors of token ids of @@ -126,7 +120,7 @@ def _update_sample_logprobs( logprobs_lst: List of tensors of logprobs of shape [topk+1] for to sampled + topk token ids Returns: - SampleLogprobs: List[Dict[str, Logprob]]: New only. + New SampleLogprobs or None """ if self.num_logprobs == 0: @@ -181,17 +175,11 @@ def _update_prompt_logprobs( logprobs: Optional[torch.Tensor], ) -> Optional[PromptLogprobs]: """ - Create formatted PromptLogprobs objects from the raw - EngineCore outputs after pythonizing + detokenizing. - - NOTE: we detokenize the logprobs *non-incrementally* - for simplicity and performance of the implementation. - Args: token_ids: Tensor of tok ids of shape [prompt_len, topk] logprobs: Tensor of logprobs of shape [prompt_len, topk] Returns: - PromptLogprobs: List[Dict[int, Logprob]] + PromptLogprobs or None """ if logprobs_token_ids is None: From 711ff138710a9ae51614525c0a1e8c5039b54c7a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:09:16 +0000 Subject: [PATCH 226/293] updated --- vllm/v1/engine/detokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 12e5a2d8571b8..45633c8dd5c04 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -196,7 +196,6 @@ def _update_prompt_logprobs( logprobs_token_ids.reshape(-1, 1)) # Make Logprob for prompt token. - # NOTE(rob): the first tok has None. 
num_tokens, num_logprobs = logprobs.shape self.prompt_logprobs = [None] + [ self._make_pos_logprob_dict( From 3a996156eeacb863b69fb1edd5689763c31daa83 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:11:01 +0000 Subject: [PATCH 227/293] reduce assets --- vllm/v1/engine/detokenizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 45633c8dd5c04..687d0db13ef79 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -124,8 +124,6 @@ def _update_sample_logprobs( """ if self.num_logprobs == 0: - assert (len(logprobs_token_ids_lst) == 0 - and len(logprobs_lst) == 0) return None assert self.logprobs is not None From 6bb6d3436a534ab0291aa57eff1132f7d68eaaeb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:12:16 +0000 Subject: [PATCH 228/293] updated --- vllm/v1/engine/detokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 687d0db13ef79..87e4e24ad2e13 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -143,16 +143,16 @@ def _update_sample_logprobs( topk_token_ids = logprobs_token_ids[1:].tolist() topk_logprobs = logprobs[1:].tolist() - # Detokenize (non-incrementally). + # Detokenize non-incrementally. decoded_tokens = self.tokenizer.batch_decode( topk_token_ids.reshape(-1, 1)) - # Make the Logprob objects for each position. + # Make the Logprob objects the position. pos_logprobs_dict = self._make_pos_logprob_dict( topk_logprobs, topk_token_ids, decoded_tokens, self.num_logprobs) - # Add the sampled Logprob if it was not in topk + # Add the sampled Logprob if it was not in topk. if sampled_token_id not in pos_logprobs_dict: token = self.tokenizer.decode(sampled_token_id) pos_logprobs_dict[sampled_token_id] = Logprob( From d73010db04b9204d5ff1c14bfdd7126bec8ea49f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:18:27 +0000 Subject: [PATCH 229/293] updated --- vllm/v1/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8df1e88da9c9b..d369728bf3a2e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -571,7 +571,6 @@ def execute_model( req_id] = self.model.sampler.get_prompt_logprobs( logits, metadata, num_logprobs) - # Update Request State. sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. 
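[Illustration, not part of the patch series] A minimal, runnable sketch of the non-incremental detokenization trick the preceding patches converge on, with a toy fake_batch_decode() standing in for the HF tokenizer and plain dicts standing in for vllm.sequence.Logprob (both are hypothetical stand-ins, not the real APIs). The point it shows: the [num_tokens, num_logprobs] token-id tensor is reshaped to [-1, 1] so batch decoding treats every id as its own one-token sequence, and the flattened output is then re-sliced per position to build the rank-ordered logprob dicts (list index equals rank because topk returns sorted values).

from typing import Dict, List

import torch


def fake_batch_decode(token_ids: torch.Tensor) -> List[str]:
    # Stand-in for tokenizer.batch_decode(); decodes each [1]-shaped row
    # independently, which is all the flattening trick relies on.
    return [f"<tok{int(row[0])}>" for row in token_ids]


def make_prompt_logprob_dicts(
        logprobs: torch.Tensor,   # [num_tokens, num_logprobs]
        token_ids: torch.Tensor,  # [num_tokens, num_logprobs]
) -> List[Dict[int, dict]]:
    num_tokens, num_logprobs = logprobs.shape
    # Flatten to [num_tokens * num_logprobs, 1] so every id is decoded
    # on its own, independent of its neighbors in the prompt.
    decoded = fake_batch_decode(token_ids.reshape(-1, 1))
    positions = []
    for tok_idx in range(num_tokens):
        offset = tok_idx * num_logprobs
        ids = token_ids[tok_idx].tolist()
        lps = logprobs[tok_idx].tolist()
        # torch.topk() returns sorted values, so list index == rank.
        positions.append({
            ids[idx]: {
                "logprob": lps[idx],
                "rank": idx,
                "decoded_token": decoded[offset + idx],
            }
            for idx in range(num_logprobs)
        })
    return positions


if __name__ == "__main__":
    lps, ids = torch.log_softmax(torch.randn(3, 50), dim=-1).topk(2, dim=-1)
    print(make_prompt_logprob_dicts(lps, ids))

In the real detokenizer the per-position dict values are Logprob objects and the first prompt position is prepended as None; the slicing arithmetic above is the same.
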
From b8f40df329c4c294aefb613bca9af69607003e1d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:33:16 +0000 Subject: [PATCH 230/293] updated --- vllm/transformers_utils/detokenizer_utils.py | 17 +++++++ vllm/v1/engine/detokenizer.py | 47 +++++++++----------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 37ff8a236e791..7dd96bd2ef72e 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,5 +1,7 @@ from typing import List, Optional, Tuple +import torch + from .tokenizer import AnyTokenizer @@ -72,6 +74,21 @@ def convert_prompt_ids_to_tokens( return new_tokens, prefix_offset, read_offset +def detokenize_non_incrementally( + tokenizer: AnyTokenizer, + token_ids: torch.Tensor, +) -> List[str]: + """Detokenize the input ids individually.""" + + # Flatten input to shape [N, 1]. Tokenizers then + # treats it as decoding batch N seq_len 1, such + # that they all happen independently. + flat_token_ids = token_ids.reshape(-1, 1) + # TODO(rob): deal with MistralTokenizer not doing + # batch_decode? + return tokenizer.batch_decode(flat_token_ids) + + # Based on # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 # under Apache 2.0 license diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 87e4e24ad2e13..0caba1e6fa391 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -9,7 +9,8 @@ from vllm.sampling_params import RequestOutputKind from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, + detokenize_non_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput @@ -113,30 +114,28 @@ def _update_sample_logprobs( logprobs_lst: List[torch.Tensor], ) -> Optional[SampleLogprobs]: """ - Args: - sampled_token_ids: List of new sampled tokens - logprobs_token_ids_lst: List of tensors of token ids of - shape [topk+1] for the sampled + topk token ids - logprobs_lst: List of tensors of logprobs of - shape [topk+1] for to sampled + topk token ids - Returns: - New SampleLogprobs or None + Update logprobs based the prior step. + + + + Tensors are: + logprobs_token_ids: [topk + 1]: topk token ids at pos + logprobs: [num_logprobs + 1]: topk logprobs at pos """ if self.num_logprobs == 0: return None assert self.logprobs is not None - # NOTE(rob): the lists are of length > 1 if EngineCore - # generates > 1 token per step (e.g. in spec decoding). - num_new_tokens = len(sampled_token_ids) + # NOTE(rob): Lists are only of length >1 if EngineCore + # generated >1 token during the prior step (e.g. spec decoding). for sampled_token_id, logprobs, logprobs_token_ids in zip( sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): # Sampler concatenates the logprobs of the sampled token # ahead of the topk tokens. 
assert sampled_token_id == logprobs_token_ids[0].item(), ( - "Sampler cats the sampled tokens logprobs in front of " + "Sampler concats the sampled token logprob in front of " f"the topk logprobs, but got {sampled_token_id=} and " f"{logprobs_token_ids[0].item()=}") sampled_token_logprob = logprobs[0].item() @@ -144,8 +143,8 @@ def _update_sample_logprobs( topk_logprobs = logprobs[1:].tolist() # Detokenize non-incrementally. - decoded_tokens = self.tokenizer.batch_decode( - topk_token_ids.reshape(-1, 1)) + decoded_tokens = detokenize_non_incrementally( + self.tokenizer, topk_token_ids) # Make the Logprob objects the position. pos_logprobs_dict = self._make_pos_logprob_dict( @@ -160,11 +159,11 @@ def _update_sample_logprobs( rank=None, # TODO: is this needed? decoded_token=token) - # Update logprobs for this sequence position. self.logprobs.append(pos_logprobs_dict) self.cumulative_logprob += sampled_token_logprob # Return just the newly generated sample logprobs. + num_new_tokens = len(sampled_token_ids) return self.logprobs[-num_new_tokens:] def _update_prompt_logprobs( @@ -172,14 +171,8 @@ def _update_prompt_logprobs( logprobs_token_ids: Optional[torch.Tensor], logprobs: Optional[torch.Tensor], ) -> Optional[PromptLogprobs]: - """ - Args: - token_ids: Tensor of tok ids of shape [prompt_len, topk] - logprobs: Tensor of logprobs of shape [prompt_len, topk] - Returns: - PromptLogprobs or None - """ + # Skip if no prompt logprobs if logprobs_token_ids is None: return None assert logprobs is not None @@ -189,9 +182,10 @@ def _update_prompt_logprobs( assert self.prompt_logprobs is None # Detokenize non-incrementally. - # [num_tok, num_lps] -> [num_tok * num_lps] - decoded_tokens = self.tokenizer.batch_decode( - logprobs_token_ids.reshape(-1, 1)) + # NOTE(rob): the output is flattened: + # [num_tok, num_lps] -> [num_tok * num_lps] + decoded_tokens = detokenize_non_incrementally(self.tokenizer, + logprobs_token_ids) # Make Logprob for prompt token. num_tokens, num_logprobs = logprobs.shape @@ -199,6 +193,7 @@ def _update_prompt_logprobs( self._make_pos_logprob_dict( logprobs[tok_idx].tolist(), logprobs_token_ids[tok_idx].tolist(), + # Deal with the flattening from above. decoded_tokens[tok_idx * num_logprobs:], num_logprobs, ) for tok_idx in range(num_tokens) From e806678a64dd6f7e9081350156f8acfa1cf15e6e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:45:01 +0000 Subject: [PATCH 231/293] clean --- vllm/v1/engine/detokenizer.py | 52 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0caba1e6fa391..167a1a9a80e47 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -110,37 +110,33 @@ def from_new_request( def _update_sample_logprobs( self, sampled_token_ids: List[int], - logprobs_token_ids_lst: List[torch.Tensor], + token_ids_lst: List[torch.Tensor], logprobs_lst: List[torch.Tensor], ) -> Optional[SampleLogprobs]: """ - Update logprobs based the prior step. - - + Lists are only of length >1 if EngineCore made + >1 tokens in prior step (e.g. in spec decoding). 
Tensors are: - logprobs_token_ids: [topk + 1]: topk token ids at pos - logprobs: [num_logprobs + 1]: topk logprobs at pos + token_ids: [topk + 1]: topk token ids at pos + logprobs: [topk + 1]: topk logprobs at pos """ if self.num_logprobs == 0: return None assert self.logprobs is not None - # NOTE(rob): Lists are only of length >1 if EngineCore - # generated >1 token during the prior step (e.g. spec decoding). - for sampled_token_id, logprobs, logprobs_token_ids in zip( - sampled_token_ids, logprobs_lst, logprobs_token_ids_lst): + for sampled_token_id, logprobs, token_ids in zip( + sampled_token_ids, logprobs_lst, token_ids_lst): - # Sampler concatenates the logprobs of the sampled token - # ahead of the topk tokens. - assert sampled_token_id == logprobs_token_ids[0].item(), ( + # Split into sampled vs top_k. + assert sampled_token_id == token_ids[0].item(), ( "Sampler concats the sampled token logprob in front of " f"the topk logprobs, but got {sampled_token_id=} and " - f"{logprobs_token_ids[0].item()=}") + f"{token_ids[0].item()=}") sampled_token_logprob = logprobs[0].item() - topk_token_ids = logprobs_token_ids[1:].tolist() - topk_logprobs = logprobs[1:].tolist() + topk_token_ids = token_ids[1:] + topk_logprobs = logprobs[1:] # Detokenize non-incrementally. decoded_tokens = detokenize_non_incrementally( @@ -148,8 +144,8 @@ def _update_sample_logprobs( # Make the Logprob objects the position. pos_logprobs_dict = self._make_pos_logprob_dict( - topk_logprobs, topk_token_ids, decoded_tokens, - self.num_logprobs) + topk_token_ids.tolist(), topk_logprobs.tolist(), + decoded_tokens, self.num_logprobs) # Add the sampled Logprob if it was not in topk. if sampled_token_id not in pos_logprobs_dict: @@ -168,12 +164,12 @@ def _update_sample_logprobs( def _update_prompt_logprobs( self, - logprobs_token_ids: Optional[torch.Tensor], + token_ids: Optional[torch.Tensor], logprobs: Optional[torch.Tensor], ) -> Optional[PromptLogprobs]: - # Skip if no prompt logprobs - if logprobs_token_ids is None: + # Skip if no prompt logprobs were generated. + if token_ids is None: return None assert logprobs is not None @@ -185,14 +181,14 @@ def _update_prompt_logprobs( # NOTE(rob): the output is flattened: # [num_tok, num_lps] -> [num_tok * num_lps] decoded_tokens = detokenize_non_incrementally(self.tokenizer, - logprobs_token_ids) + token_ids) # Make Logprob for prompt token. num_tokens, num_logprobs = logprobs.shape self.prompt_logprobs = [None] + [ self._make_pos_logprob_dict( logprobs[tok_idx].tolist(), - logprobs_token_ids[tok_idx].tolist(), + token_ids[tok_idx].tolist(), # Deal with the flattening from above. decoded_tokens[tok_idx * num_logprobs:], num_logprobs, @@ -282,15 +278,15 @@ def add_tokens( # 3) Make Sample Logprobs. logprobs = self._update_sample_logprobs( - sampled_token_ids=new_token_ids, - logprobs_token_ids_lst=new_logprobs_token_ids, - logprobs_lst=new_logprobs, + new_token_ids, + new_logprobs_token_ids, + new_logprobs, ) # 4) Make Prompt Logprobs. prompt_logprobs = self._update_prompt_logprobs( - logprobs_token_ids=new_prompt_logprobs_token_ids, - logprobs=new_prompt_logprobs, + new_prompt_logprobs_token_ids, + new_prompt_logprobs, ) # 5) Makes the RequestOutput object with the new text. 
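[Illustration, not part of the patch series] A rough, runnable sketch of the sample-logprobs path that the patch above settles on, again with plain dicts in place of vllm.sequence.Logprob and a toy fake_decode() in place of the tokenizer (hypothetical stand-ins). Per position the sampler hands back num_logprobs + 1 token ids and logprobs with the sampled token concatenated in front; the sketch splits that apart, builds the topk dict in rank order, adds a separate entry for the sampled token only when it did not already appear in the topk, and accumulates the running cumulative logprob.

from typing import Dict, List, Tuple

import torch


def fake_decode(token_id: int) -> str:
    # Stand-in for tokenizer.decode() on a single id (not the real API).
    return f"<tok{token_id}>"


def update_sample_logprobs(
        sampled_token_ids: List[int],
        token_ids_lst: List[torch.Tensor],  # each [num_logprobs + 1]
        logprobs_lst: List[torch.Tensor],   # each [num_logprobs + 1]
        num_logprobs: int,
) -> Tuple[List[Dict[int, dict]], float]:
    new_positions = []
    cumulative_logprob = 0.0
    for sampled_id, logprobs, token_ids in zip(sampled_token_ids,
                                               logprobs_lst, token_ids_lst):
        # Position 0 is the sampled token; 1: are the topk entries.
        assert sampled_id == int(token_ids[0])
        sampled_logprob = float(logprobs[0])
        topk_ids = token_ids[1:].tolist()
        topk_lps = logprobs[1:].tolist()

        # Build the rank-ordered topk dict (topk output is sorted).
        pos = {
            topk_ids[idx]: {
                "logprob": topk_lps[idx],
                "rank": idx,
                "decoded_token": fake_decode(topk_ids[idx]),
            }
            for idx in range(num_logprobs)
        }
        # Add the sampled token's entry only if it did not already land
        # in the topk; its rank is left unset, mirroring the patch.
        if sampled_id not in pos:
            pos[sampled_id] = {
                "logprob": sampled_logprob,
                "rank": None,
                "decoded_token": fake_decode(sampled_id),
            }

        cumulative_logprob += sampled_logprob
        new_positions.append(pos)
    return new_positions, cumulative_logprob


if __name__ == "__main__":
    ids = torch.tensor([7, 3, 9])           # sampled id 7, topk ids 3 and 9
    lps = torch.tensor([-0.5, -0.4, -1.2])  # matching logprobs
    print(update_sample_logprobs([7], [ids], [lps], num_logprobs=2))

The lists are longer than one entry only when the engine produced more than one token in the prior step (e.g. speculative decoding), which is why the sketch, like the real method, loops over positions.
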
From afef9324ebf3f61f8bcce1d42346a9355db9aec7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:47:40 +0000 Subject: [PATCH 232/293] reduce cruft --- vllm/v1/worker/gpu_input_batch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 1eaabe10a5243..6f487976f5ae2 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -257,7 +257,6 @@ def make_sampling_metadata( self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) self.top_k[:self.num_reqs].copy_( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) - return SamplingMetadata( sample_indicies=sample_indices, all_greedy=self.all_greedy, From 71580ae9e25041495a504bd1cac4535f354dfee3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 2 Jan 2025 23:49:17 +0000 Subject: [PATCH 233/293] revert crruft --- vllm/v1/worker/gpu_model_runner.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d369728bf3a2e..1a8ea3828a06b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -265,12 +265,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - def _prepare_inputs( - self, - scheduler_output: "SchedulerOutput", - ) -> Tuple[FlashAttentionMetadata, SamplingMetadata, - Optional[PromptLogprobsMetadata]]: - + def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs From 1d52a37bbfda16293dbe0dfd61868aff841cde55 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 00:17:54 +0000 Subject: [PATCH 234/293] updated --- vllm/v1/sample/metadata.py | 11 +---- vllm/v1/worker/gpu_input_batch.py | 72 ++++++++++----------------- vllm/v1/worker/gpu_model_runner.py | 79 +++++++++++++++++------------- 3 files changed, 71 insertions(+), 91 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 3b286d74355e9..41c887076f10b 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -17,7 +17,6 @@ class LogitsProcessMetadata: @dataclass class SamplingMetadata: - sample_indicies: torch.Tensor all_greedy: bool all_random: bool logits_process_metadata: LogitsProcessMetadata @@ -28,11 +27,5 @@ class SamplingMetadata: @dataclass class PromptLogprobsMetadata: - req_ids: List[str] - masks: List[int] - logits_process_metadatas: List[LogitsProcessMetadata] - num_prompt_logprobs: List[int] - - def zipped(self): - return zip(self.req_ids, self.masks, self.logits_process_metadatas, - self.num_prompt_logprobs) + prompt_indices: torch.Tensor + logits_process_metadata: LogitsProcessMetadata diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 6f487976f5ae2..a2093de1efeb0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -273,57 +273,35 @@ def make_sampling_metadata( def make_prompt_logprobs_metadata( self, + req_id: str, partial_req_ids: List[str], req_indices: npt.NDArray, - ) -> Optional[PromptLogprobsMetadata]: - - if self.no_prompt_logprob: - return None - - # Precompute the indicies. 
- all_indicies = np.arange(req_indices.shape[0]) - - # NOTE(rob): we should avoid loops like this in ModelRunner, - # but this ONLY loops over requests that are currently in - # prefill phase AND need prompt lps. - # Should we move this to _update_states or execute_model() - # to avoid another loop? - req_ids = [] - masks = [] - logits_process_metadatas = [] - num_prompt_logprobs = [] - for req_id in self.num_prompt_logprobs: - req_idx = self.req_id_to_index[req_id] - - # Make the logits mask for this request's prefill. - mask = all_indicies[req_indices == req_idx].tolist() - if req_id not in partial_req_ids: - # Remove the sample token if there is one. - mask = mask[:-1] - - # NOTE(rob): the tensors are shape 1, so we can use them in - # process_logits since they will be broadcasted to shape N. - temperature = self.temperature[req_idx] - top_p = self.top_p[req_idx] - top_k = self.top_k[req_idx] - no_top_p = req_id not in self.top_p_reqs - no_top_k = req_id not in self.top_k_reqs - - req_ids.append(req_id) - masks.append(mask) - num_prompt_logprobs.append(self.num_prompt_logprobs[req_id]) - logits_process_metadatas.append( - LogitsProcessMetadata(temperature=temperature, - top_p=top_p, - top_k=top_k, - no_top_p=no_top_p, - no_top_k=no_top_k)) + ) -> PromptLogprobsMetadata: + req_idx = self.req_id_to_index[req_id] + + # Get the indices for this prefill in current batch. + all_indicies = torch.arange(req_indices.shape[0]) + indices = all_indicies[req_indices == req_idx] + if req_id not in partial_req_ids: + # Remove the sample token if there is one. + indices = indices[:-1] + + # The tensors are shape 1, so we can use them in process_logits + # since they will be broadcasted to shape N. + temperature = self.temperature[req_idx] + top_p = self.top_p[req_idx] + top_k = self.top_k[req_idx] + no_top_p = req_id not in self.top_p_reqs + no_top_k = req_id not in self.top_k_reqs return PromptLogprobsMetadata( - req_ids=req_ids, - logits_process_metadatas=logits_process_metadatas, - masks=masks, - num_prompt_logprobs=num_prompt_logprobs) + prompt_indices=indices, + logits_process_metadata=LogitsProcessMetadata( + temperature=temperature, + top_p=top_p, top_k=top_k, + no_top_p=no_top_p, + no_top_k=no_top_k), + ) @property def num_reqs(self) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1a8ea3828a06b..2100753c4b955 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,6 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast +from typing import TYPE_CHECKING, Dict, List, Tuple, cast import numpy as np import numpy.typing as npt @@ -370,22 +370,18 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): slot_mapping=slot_mapping, ) - # Make Sampling and Prompt Logprobs Metadata. - sampling_metadata, prompt_logprobs_metadata = self._prepare_sampling( - scheduler_output=scheduler_output, - # Here we assume there is one generated token per step. - sample_indices=query_start_loc[1:] - 1, - req_indices=req_indices, - ) - - return attn_metadata, sampling_metadata, prompt_logprobs_metadata + # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial + # request in the batch. While we should not sample any token from this + # partial request, we do so for simplicity. We will ignore the sampled + # token from the partial request. + # TODO: Support prompt logprobs. 
+ logits_indices = query_start_loc[1:] - 1 + return attn_metadata, logits_indices, req_indices def _prepare_sampling( - self, + self, scheduler_output: "SchedulerOutput", - sample_indices: torch.Tensor, - req_indices: npt.NDArray, - ) -> Tuple[SamplingMetadata, Optional[PromptLogprobsMetadata]]: + ) -> SamplingMetadata: skip_copy = True if (scheduler_output.finished_req_ids or scheduler_output.preempted_req_ids): @@ -394,14 +390,21 @@ def _prepare_sampling( or scheduler_output.scheduled_resumed_reqs): skip_copy = False # Create the sampling metadata. - sampling_metadata = self.input_batch.make_sampling_metadata( - skip_copy, sample_indices) - + sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) + return sampling_metadata + + def _prepare_prompt_logprobs( + self, + req_id: str, + scheduler_output: "SchedulerOutput", + req_indices: npt.NDArray, + ) -> PromptLogprobsMetadata: + # Create the prompt logprobs metadata. - prompt_lps_metadata = self.input_batch.make_prompt_logprobs_metadata( - scheduler_output.partial_req_ids, req_indices) - - return sampling_metadata, prompt_lps_metadata + metadata = self.input_batch.make_prompt_logprobs_metadata( + req_id, scheduler_output.partial_req_ids, req_indices) + + return metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs @@ -491,7 +494,7 @@ def execute_model( encoder_outputs = [] # Prepare the decoder inputs. - attn_metadata, sampling_metadata, prompt_logprobs_metadata = ( + attn_metadata, logits_indices, req_indices = ( self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph @@ -538,7 +541,7 @@ def execute_model( inputs_embeds=inputs_embeds, ) hidden_states = hidden_states[:num_scheduled_tokens] - sample_hidden_states = hidden_states[sampling_metadata.sample_indicies] + sample_hidden_states = hidden_states[logits_indices] sample_logits = self.model.compute_logits(sample_hidden_states, None) # Sample the next token and get logprobs if needed. @@ -548,23 +551,29 @@ def execute_model( ) # Compute prompt logprobs if needed. - # NOTE(rob): for clean code, compute prompt logprobs for each req - # separately, which is suboptimal. However, prompt logprobs are rare - # (used mostly by lm-eval-harness) and we have few prefill per batch, - # so we prioritize simplicity. + # NOTE(rob): for simplicity, suboptimally compute prompt logprobs + # for each req separately. Prompt logprobs are rare (used for eval), + # and we have few prefills per batch, so prioritize simple impl. prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} - if prompt_logprobs_metadata: - for (req_id, mask, metadata, - num_logprobs) in prompt_logprobs_metadata.zipped(): - # TODO: make prompt lp metadata here? + if not self.input_batch.no_prompt_logprob: + for (req_id, num_prompt_logprobs) in self.input_batch.num_prompt_logprobs.items(): + + # Prepare mask and logits processor. + metadata = self._prepare_prompt_logprobs( + req_id, scheduler_output, req_indices) # Compute logits. logits = self.model.sampler.compute_logits( - hidden_states[mask], None) + hidden_states[metadata.prompt_indices], None) + # Compute prompt logprobs. 
- prompt_lps_dict[ - req_id] = self.model.sampler.get_prompt_logprobs( - logits, metadata, num_logprobs) + prompt_lps_dict[req_id] = ( + self.model.sampler.get_prompt_logprobs( + logits, + metadata.logits_process_metadata, + num_prompt_logprobs, + ) + ) sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From c8eef87a2bbeadc59ab859fe9ee327114ea41fb2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 00:24:29 +0000 Subject: [PATCH 235/293] cleanup --- vllm/v1/worker/gpu_input_batch.py | 7 +++--- vllm/v1/worker/gpu_model_runner.py | 34 ++++++++++++++++-------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index a2093de1efeb0..8dba5cfb2c734 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -247,7 +247,6 @@ def condense(self, empty_req_indices: List[int]) -> None: def make_sampling_metadata( self, - sample_indices: torch.Tensor, skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -258,7 +257,6 @@ def make_sampling_metadata( self.top_k[:self.num_reqs].copy_( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) return SamplingMetadata( - sample_indicies=sample_indices, all_greedy=self.all_greedy, all_random=self.all_random, logits_process_metadata=LogitsProcessMetadata( @@ -298,8 +296,9 @@ def make_prompt_logprobs_metadata( prompt_indices=indices, logits_process_metadata=LogitsProcessMetadata( temperature=temperature, - top_p=top_p, top_k=top_k, - no_top_p=no_top_p, + top_p=top_p, + top_k=top_k, + no_top_p=no_top_p, no_top_k=no_top_k), ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2100753c4b955..6d9f539ba0374 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -379,7 +379,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): return attn_metadata, logits_indices, req_indices def _prepare_sampling( - self, + self, scheduler_output: "SchedulerOutput", ) -> SamplingMetadata: skip_copy = True @@ -392,18 +392,18 @@ def _prepare_sampling( # Create the sampling metadata. sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata - + def _prepare_prompt_logprobs( self, req_id: str, scheduler_output: "SchedulerOutput", req_indices: npt.NDArray, ) -> PromptLogprobsMetadata: - + # Create the prompt logprobs metadata. metadata = self.input_batch.make_prompt_logprobs_metadata( req_id, scheduler_output.partial_req_ids, req_indices) - + return metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -542,38 +542,40 @@ def execute_model( ) hidden_states = hidden_states[:num_scheduled_tokens] sample_hidden_states = hidden_states[logits_indices] - sample_logits = self.model.compute_logits(sample_hidden_states, None) + logits = self.model.compute_logits(sample_hidden_states, None) # Sample the next token and get logprobs if needed. + sampling_metadata = self._prepare_sampling(scheduler_output) sampler_output = self.model.sample( - logits=sample_logits, + logits=logits, sampling_metadata=sampling_metadata, ) # Compute prompt logprobs if needed. - # NOTE(rob): for simplicity, suboptimally compute prompt logprobs - # for each req separately. Prompt logprobs are rare (used for eval), - # and we have few prefills per batch, so prioritize simple impl. 
+ # NOTE(rob): for simplicity, compute prompt logprobs for each + # request separately. Prompt logprobs are rare (used for eval), + # and few prefills per batch, so prioritize simple over optimal. prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} if not self.input_batch.no_prompt_logprob: - for (req_id, num_prompt_logprobs) in self.input_batch.num_prompt_logprobs.items(): - + for (req_id, num_prompt_logprobs + ) in self.input_batch.num_prompt_logprobs.items(): + # Prepare mask and logits processor. metadata = self._prepare_prompt_logprobs( req_id, scheduler_output, req_indices) # Compute logits. + prompt_hidden_states = hidden_states[metadata.prompt_indices] logits = self.model.sampler.compute_logits( - hidden_states[metadata.prompt_indices], None) + prompt_hidden_states, None) # Compute prompt logprobs. - prompt_lps_dict[req_id] = ( - self.model.sampler.get_prompt_logprobs( + prompt_lps_dict[ + req_id] = self.model.sampler.get_prompt_logprobs( logits, - metadata.logits_process_metadata, + metadata.logits_process_metadata, num_prompt_logprobs, ) - ) sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From b501aedd81c5256ce2aa593a0ee3a584056e33cc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 00:27:19 +0000 Subject: [PATCH 236/293] updated --- vllm/v1/sample/metadata.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 41c887076f10b..5add1550654aa 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, List +from typing import Dict import torch diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6d9f539ba0374..8f6046ec271fa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -557,12 +557,12 @@ def execute_model( # and few prefills per batch, so prioritize simple over optimal. prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} if not self.input_batch.no_prompt_logprob: - for (req_id, num_prompt_logprobs + for (request_id, num_prompt_logprobs ) in self.input_batch.num_prompt_logprobs.items(): # Prepare mask and logits processor. metadata = self._prepare_prompt_logprobs( - req_id, scheduler_output, req_indices) + request_id, scheduler_output, req_indices) # Compute logits. prompt_hidden_states = hidden_states[metadata.prompt_indices] @@ -571,7 +571,7 @@ def execute_model( # Compute prompt logprobs. 
prompt_lps_dict[ - req_id] = self.model.sampler.get_prompt_logprobs( + request_id] = self.model.sampler.get_prompt_logprobs( logits, metadata.logits_process_metadata, num_prompt_logprobs, From ac070f820aefc0cd0f9445f3800bb3fdc82e51b1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 01:28:02 +0000 Subject: [PATCH 237/293] updated --- vllm/v1/sample/sampler.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 0cfd1b36161ac..de9f126364c54 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -41,7 +41,7 @@ def forward( ) return sampler_output - def compute_prompt_logprobs( + def get_prompt_logprobs( self, logits: torch.Tensor, logits_process_metadata: LogitsProcessMetadata, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8f6046ec271fa..ad3f936f4e010 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -555,7 +555,7 @@ def execute_model( # NOTE(rob): for simplicity, compute prompt logprobs for each # request separately. Prompt logprobs are rare (used for eval), # and few prefills per batch, so prioritize simple over optimal. - prompt_lps_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} + prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} if not self.input_batch.no_prompt_logprob: for (request_id, num_prompt_logprobs ) in self.input_batch.num_prompt_logprobs.items(): @@ -570,12 +570,11 @@ def execute_model( prompt_hidden_states, None) # Compute prompt logprobs. - prompt_lps_dict[ + # TODO(rob): Why is this in the model? + prompt_logprobs_dict[ request_id] = self.model.sampler.get_prompt_logprobs( - logits, - metadata.logits_process_metadata, - num_prompt_logprobs, - ) + logits, metadata.logits_process_metadata, + num_prompt_logprobs) sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over @@ -612,7 +611,7 @@ def execute_model( sampled_token_ids=sampled_token_ids, logprob_token_ids_cpu=sampler_output.logprob_token_ids, logprobs_cpu=sampler_output.logprobs, - prompt_logprobs_dict=prompt_lps_dict, + prompt_logprobs_dict=prompt_logprobs_dict, ) return model_runner_output From 9a28ddfdf5fa79296520e3b74844c14973e55c6e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 01:32:10 +0000 Subject: [PATCH 238/293] updated --- vllm/v1/core/scheduler.py | 5 +++-- vllm/v1/engine/processor.py | 1 - vllm/v1/worker/gpu_input_batch.py | 6 +++--- vllm/v1/worker/gpu_model_runner.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index aa72047a91cfd..77c01a7bb9e75 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -444,8 +444,9 @@ def update_from_output( logprobs = logprobs_cpu[req_index] # Extract prompt logprobs for this req if needed. - # FIXME(rob): handle partial request. Currently we throw away - # the prompt logprobs for the partial request. + # FIXME(rob): Currently we throw away the prompt logprobs + # of an in progress partial request. We can handle this + # by updating the Request object to hold prompt logprobs. 
prompt_logprobs_token_ids, prompt_logprobs = ( prompt_logprobs_dict.get(req_id, (None, None))) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index c71eca4bfe7a8..d8d3c36571b96 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -93,7 +93,6 @@ def process_inputs( # TODO(woosuk): Support pooling models. # TODO(woosuk): Support encoder-decoder models. - # TODO(rob): Validate all SamplingParams. self._validate_logprobs(params) self._validate_lora(lora_request) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 8dba5cfb2c734..ed36e283b6bc5 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -170,9 +170,9 @@ def add_request( if sampling_params.logprobs: self.num_logprobs[req_id] = sampling_params.logprobs if sampling_params.prompt_logprobs: - # TODO(rob): handle prefix caching and recomputation. - # We need to re-run the prefill if requesting prompt - # logprobs w/ prefix caching. + # FIXME(rob): handle prefix caching and preemption. + # We currently get incorrect results if prompt logprobs + # are requested and we get a cache hit. self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs def remove_request(self, req_id: str) -> Optional[int]: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ad3f936f4e010..720dfe80366e7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -570,7 +570,7 @@ def execute_model( prompt_hidden_states, None) # Compute prompt logprobs. - # TODO(rob): Why is this in the model? + # TODO(rob): Should we move the sampler out of the model? prompt_logprobs_dict[ request_id] = self.model.sampler.get_prompt_logprobs( logits, metadata.logits_process_metadata, From d1a956df6ac3743f41e8e7ddcd3a9a578d8b532c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 01:36:48 +0000 Subject: [PATCH 239/293] update comment --- vllm/v1/outputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 5460afaa1bdf3..1cc6f4824c068 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -35,5 +35,5 @@ class ModelRunnerOutput: logprobs_cpu: Optional[torch.Tensor] # req_id -> (prompt_logprobs_token_ids, prompt_logprobs) - # [num_reqs, max_num_prompt_logprobs] + # [prompt_len, num_prompt_logprobs] prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] From 5fd00608629737115d605ed1fe189de20f82835e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 01:55:29 +0000 Subject: [PATCH 240/293] updated --- vllm/v1/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 720dfe80366e7..19a6421e5412c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -369,7 +369,6 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): block_table=self.input_batch.block_table[:num_reqs], slot_mapping=slot_mapping, ) - # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial # request in the batch. While we should not sample any token from this # partial request, we do so for simplicity. 
We will ignore the sampled From 0d2f7c8a5d2501b534260e2e623f1d9f8595a48d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:38:57 +0000 Subject: [PATCH 241/293] stash --- vllm/v1/engine/__init__.py | 2 - vllm/v1/sample/metadata.py | 29 ++++------ vllm/v1/sample/sampler.py | 90 +++++++++++++----------------- vllm/v1/worker/gpu_input_batch.py | 66 ---------------------- vllm/v1/worker/gpu_model_runner.py | 19 +++++-- 5 files changed, 64 insertions(+), 142 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 93defb723d57f..092c8faf56f63 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -8,8 +8,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import SamplingParams - logprobs: int - prompt_logprobs: int @dataclass diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index d2394a5e2980c..d60f7eb5d76f9 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -5,36 +5,27 @@ @dataclass -class LogitsProcessMetadata: +class SamplingMetadata: temperature: torch.Tensor + all_greedy: bool + all_random: bool + top_p: torch.Tensor top_k: torch.Tensor - frequency_penalties: torch.Tensor - presence_penalties: torch.Tensor - repetition_penalties: torch.Tensor no_top_p: bool no_top_k: bool - no_penalties: bool + generators: Dict[int, torch.Generator] -@dataclass -class SamplingMetadata: + max_num_logprobs: int no_penalties: bool - all_greedy: bool - all_random: bool - logits_process_metadata: LogitsProcessMetadata - generators: Dict[int, torch.Generator] - max_num_logprobs: int prompt_token_ids: Optional[torch.Tensor] + frequency_penalties: torch.Tensor + presence_penalties: torch.Tensor + repetition_penalties: torch.Tensor + output_token_ids: List[List[int]] min_tokens: List[int] stop_token_ids: List[Set[int]] - - -@dataclass -class PromptLogprobsMetadata: - - prompt_indices: torch.Tensor - logits_process_metadata: LogitsProcessMetadata diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 3e95dd4e4064f..00c85e0f8c169 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -24,20 +24,19 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: + + # NOTE(woosuk): Use the original logits (before any penalties or + # temperature scaling) for the top-k logprobs. + # This is different from the V0 sampler, which uses the logits that + # is used for sampling (after penalties and temperature scaling). needs_logprobs = sampling_metadata.max_num_logprobs > 0 + raw_logits = torch.Tensor([]) if needs_logprobs: - # NOTE(woosuk): Use the original logits (before any penalties or - # temperature scaling) for the top-k logprobs. - # This is different from the V0 sampler, which uses the logits that - # is used for sampling (after penalties and temperature scaling). - # NOTE: We compute logprobs first because the below ops may - # modify the logits tensor in-place (and we don't want to clone - # the logits tensor for memory efficiency). - topk_logprobs, topk_indices = self.get_topk_logprobs( - logits, sampling_metadata) - else: - topk_logprobs = None - topk_indices = None + # NOTE(rob): We have to clone the raw logits (at fp16) to + # compute logprobs AFTER sampling, since we need return + # the logprob of the sampled token. 
+ raw_logits = torch.empty_like(logits) + raw_logits.copy_(logits, non_blocking=True) # Use float32 for the logits. logits = logits.to(torch.float32) @@ -50,11 +49,18 @@ def forward( # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) + # Compute topk and sample logprobs. + logprob_token_ids, logprobs = self.get_logprobs( + raw_logits, + sampling_metadata.max_num_logprobs, + sampled=sampled, + ) + # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( sampled_token_ids=sampled.tolist(), - logprob_token_ids=logprob_token_ids, - logprobs=logprobs, + logprob_token_ids=logprob_token_ids or logprob_token_ids.cpu(), + logprobs=logprobs or logprobs.cpu(), ) return sampler_output @@ -74,33 +80,6 @@ def get_prompt_logprobs( return logprob_token_ids, logprobs - def _compute_logprobs( - self, - logits: torch.Tensor, - max_num_logprobs: int, - sampled_token_ids: Optional[torch.Tensor] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - if max_num_logprobs > 0: - logprobs = self.get_logprobs(logits) - topk_logprobs, topk_indices = torch.topk(logprobs, - max_num_logprobs, - dim=-1, - sorted=True) - # Use int32 to reduce the tensor size. - topk_indices = topk_indices.to(torch.int32) - - # Concatenate with the sampled token_id if provided. - if sampled_token_ids: - # TODO(rob): check if the indexing / concatting is right - # TODO(rob): do we need to return the rank of the sampled? - sampled_logprobs = logprobs[:, sampled_token_ids] - topk_indices = torch.cat([sampled_token_ids, topk_indices]) - topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) - - return topk_indices.cpu(), topk_logprobs.cpu() - else: - return None, None - def apply_temperature( self, logits: torch.Tensor, @@ -145,18 +124,29 @@ def sample( ) return sampled - def get_topk_logprobs( + def get_logprobs( self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Tuple[torch.Tensor, torch.Tensor]: - logprobs = logits.log_softmax(dim=-1, dtype=torch.float32) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) + logprobs: torch.Tensor, + num_logprobs: int, + sampled: Optional[torch.Tensor] = None, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + if num_logprobs == 0: + return None, None + + topk_logprobs, topk_indices = torch.topk(logprobs, + num_logprobs, + dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) + + # Concatenate with the sampled token_id if provided. + if sampled: + # TODO(andy): do we need to return the rank of the sampled? + # TODO(andy): is this indexing right? 
+ sampled_logprobs = logprobs[:, sampled] + topk_indices = torch.cat([sampled, topk_indices]) + topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) + return topk_logprobs, topk_indices def apply_penalties( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 1cbc8002532d1..d8bc4765ca04e 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -409,72 +409,6 @@ def _make_prompt_token_ids_tensor(self) -> torch.Tensor: return prompt_token_ids_cpu_tensor.to(device=self.device, non_blocking=True) - def make_prompt_logprobs_metadata( - self, - req_id: str, - partial_req_ids: List[str], - req_indices: npt.NDArray, - ) -> PromptLogprobsMetadata: - req_idx = self.req_id_to_index[req_id] - - # Get the indices for this prefill in current batch. - all_indicies = torch.arange(req_indices.shape[0]) - indices = all_indicies[req_indices == req_idx] - if req_id not in partial_req_ids: - # Remove the sample token if there is one. - indices = indices[:-1] - - # The tensors are shape 1, so we can use them in process_logits - # since they will be broadcasted to shape N. - temperature = self.temperature[req_idx] - top_p = self.top_p[req_idx] - top_k = self.top_k[req_idx] - no_top_p = req_id not in self.top_p_reqs - no_top_k = req_id not in self.top_k_reqs - - return PromptLogprobsMetadata( - prompt_indices=indices, - logits_process_metadata=LogitsProcessMetadata( - temperature=temperature, - top_p=top_p, - top_k=top_k, - no_top_p=no_top_p, - no_top_k=no_top_k), - ) - - def make_prompt_logprobs_metadata( - self, - req_id: str, - partial_req_ids: List[str], - req_indices: npt.NDArray, - ) -> PromptLogprobsMetadata: - req_idx = self.req_id_to_index[req_id] - - # Get the indices for this prefill in current batch. - all_indicies = torch.arange(req_indices.shape[0]) - indices = all_indicies[req_indices == req_idx] - if req_id not in partial_req_ids: - # Remove the sample token if there is one. - indices = indices[:-1] - - # The tensors are shape 1, so we can use them in process_logits - # since they will be broadcasted to shape N. - temperature = self.temperature[req_idx] - top_p = self.top_p[req_idx] - top_k = self.top_k[req_idx] - no_top_p = req_id not in self.top_p_reqs - no_top_k = req_id not in self.top_k_reqs - - return PromptLogprobsMetadata( - prompt_indices=indices, - logits_process_metadata=LogitsProcessMetadata( - temperature=temperature, - top_p=top_p, - top_k=top_k, - no_top_p=no_top_p, - no_top_k=no_top_k), - ) - @property def num_reqs(self) -> int: return len(self.req_id_to_index) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d57f39b092c5..065163c9a3539 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -497,13 +497,22 @@ def _prepare_prompt_logprobs( req_id: str, scheduler_output: "SchedulerOutput", req_indices: npt.NDArray, - ) -> PromptLogprobsMetadata: + ) -> torch.Tensor: + + # req_indices is the req_idx of each batched token. + # So if we have 3 sequences of lens [2, 5, 3], + # req_indices = [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] + + req_idx = self.input_batch.req_id_to_index[req_id] - # Create the prompt logprobs metadata. - metadata = self.input_batch.make_prompt_logprobs_metadata( - req_id, scheduler_output.partial_req_ids, req_indices) + # Get the indices for this (prefill) req in current batch. 
+ num_tokens = req_indices.shape[0] + prompt_indices = self.arange_np[:num_tokens][req_indices == req_idx] + if req_id not in scheduler_output.partial_req_ids: + # Remove the sample token if there is one. + indices = indices[:-1] - return metadata + return prompt_indices def _execute_encoder(self, scheduler_output: "SchedulerOutput"): scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs From 06b9abaa33b925f0b710d9fa0e22791bc4b0a7af Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:40:19 +0000 Subject: [PATCH 242/293] cleanup --- vllm/v1/sample/sampler.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 00c85e0f8c169..c52c28df47257 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -24,14 +24,12 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - - # NOTE(woosuk): Use the original logits (before any penalties or - # temperature scaling) for the top-k logprobs. - # This is different from the V0 sampler, which uses the logits that - # is used for sampling (after penalties and temperature scaling). needs_logprobs = sampling_metadata.max_num_logprobs > 0 - raw_logits = torch.Tensor([]) if needs_logprobs: + # NOTE(woosuk): Use the original logits (before any penalties or + # temperature scaling) for the top-k logprobs. + # This is different from the V0 sampler, which uses the logits that + # is used for sampling (after penalties and temperature scaling). # NOTE(rob): We have to clone the raw logits (at fp16) to # compute logprobs AFTER sampling, since we need return # the logprob of the sampled token. From 035e2c22c045516f3d5f3e45f3660f040a26e1a6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:47:31 +0000 Subject: [PATCH 243/293] updated --- tests/v1/engine/test_detokenizer.py | 6 ++-- vllm/v1/sample/sampler.py | 43 ++++++++++++++--------------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 9f7c5a38e860f..705c3372132d4 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -168,8 +168,6 @@ def test_incremental_detokenization( output_kind=request_output_kind, stop=[], include_stop_str_in_output=False)) - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) @@ -257,8 +255,8 @@ def test_stop_string( output_kind=RequestOutputKind.DELTA, stop=STOP_STRINGS, include_stop_str_in_output=include_stop_str_in_output, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, )) for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index c52c28df47257..233f3fc3ab706 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -33,8 +33,7 @@ def forward( # NOTE(rob): We have to clone the raw logits (at fp16) to # compute logprobs AFTER sampling, since we need return # the logprob of the sampled token. - raw_logits = torch.empty_like(logits) - raw_logits.copy_(logits, non_blocking=True) + raw_logits = logits.clone() # Use float32 for the logits. logits = logits.to(torch.float32) @@ -47,12 +46,14 @@ def forward( # Use int32 to reduce the tensor size. 
sampled = sampled.to(torch.int32) - # Compute topk and sample logprobs. - logprob_token_ids, logprobs = self.get_logprobs( - raw_logits, - sampling_metadata.max_num_logprobs, - sampled=sampled, - ) + if needs_logprobs: + # Compute topk and sample logprobs. + logprob_token_ids, logprobs = self.get_logprobs( + raw_logits, + sampling_metadata.max_num_logprobs, + sampled=sampled) + else: + logprob_token_ids, logprobs = None, None # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( @@ -124,25 +125,23 @@ def sample( def get_logprobs( self, - logprobs: torch.Tensor, + logits: torch.Tensor, num_logprobs: int, - sampled: Optional[torch.Tensor] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: - if num_logprobs == 0: - return None, None - - topk_logprobs, topk_indices = torch.topk(logprobs, - num_logprobs, - dim=-1) + sampled_token_ids: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Compute logprobs. + logprobs = logits.log_softmax(dim=-1, dtype=torch.float32) + topk_logprobs, topk_indices = torch.topk( + logprobs, num_logprobs, dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) - # Concatenate with the sampled token_id if provided. - if sampled: - # TODO(andy): do we need to return the rank of the sampled? + # Concatenate with the sampled token_ids if provided. + if sampled_token_ids: + # TODO(rob): do we need to return the rank of the sampled? # TODO(andy): is this indexing right? - sampled_logprobs = logprobs[:, sampled] - topk_indices = torch.cat([sampled, topk_indices]) + sampled_logprobs = logprobs[:, sampled_token_ids] + topk_indices = torch.cat([sampled_token_ids, topk_indices]) topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) return topk_logprobs, topk_indices From 17e41c8e434785c45103f11cc95ea97d00804f88 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:50:16 +0000 Subject: [PATCH 244/293] remove --- vllm/v1/sample/sampler.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 233f3fc3ab706..506b440b9ae7e 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -47,7 +47,8 @@ def forward( sampled = sampled.to(torch.int32) if needs_logprobs: - # Compute topk and sample logprobs. + # Get sampled and topk token logprobs. + # NOTE: CPU<>GPU sync happens here. logprob_token_ids, logprobs = self.get_logprobs( raw_logits, sampling_metadata.max_num_logprobs, @@ -58,27 +59,11 @@ def forward( # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( sampled_token_ids=sampled.tolist(), - logprob_token_ids=logprob_token_ids or logprob_token_ids.cpu(), - logprobs=logprobs or logprobs.cpu(), + logprob_token_ids=logprob_token_ids, + logprobs=logprobs, ) return sampler_output - def get_prompt_logprobs( - self, - logits: torch.Tensor, - logits_process_metadata: LogitsProcessMetadata, - num_logprobs: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - logits = self.apply_temperature(logits, - logits_process_metadata.temperature) - logits = self.apply_top_k_top_p(logits, logits_process_metadata) - - # NOTE: CPU-GPU synchronization happens here. 
- logprob_token_ids, logprobs = self._compute_logprobs( - logits=logits, max_num_logprobs=num_logprobs) - - return logprob_token_ids, logprobs - def apply_temperature( self, logits: torch.Tensor, @@ -131,8 +116,9 @@ def get_logprobs( ) -> Tuple[torch.Tensor, torch.Tensor]: # Compute logprobs. logprobs = logits.log_softmax(dim=-1, dtype=torch.float32) - topk_logprobs, topk_indices = torch.topk( - logprobs, num_logprobs, dim=-1) + topk_logprobs, topk_indices = torch.topk(logprobs, + num_logprobs, + dim=-1) # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) @@ -144,7 +130,7 @@ def get_logprobs( topk_indices = torch.cat([sampled_token_ids, topk_indices]) topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) - return topk_logprobs, topk_indices + return topk_logprobs.cpu(), topk_indices.cpu() def apply_penalties( self, From 2cb483237934fa5a5a9a168490ea6f7e8076b185 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:51:43 +0000 Subject: [PATCH 245/293] finish cleaning sampler.py --- vllm/v1/sample/sampler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 506b440b9ae7e..3a7d271fa2102 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,11 +1,11 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Tuple, Optional, Tuple +from typing import Optional, Tuple import torch import torch.nn as nn from vllm.v1.outputs import SamplerOutput -from vllm.v1.sample.metadata import LogitsProcessMetadata, SamplingMetadata +from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.penalties import (apply_all_penalties, apply_min_token_penalties) from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler @@ -101,8 +101,7 @@ def sample( greedy_sampled = self.greedy_sample(logits) sampled = torch.where( - sampling_metadata.logits_process_metadata.temperature < - _SAMPLING_EPS, + sampling_metadata.temperature < _SAMPLING_EPS, greedy_sampled, random_sampled, ) From 92595a469e74e968742007d1bf557972f0a776bf Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:53:52 +0000 Subject: [PATCH 246/293] updated --- vllm/v1/worker/gpu_input_batch.py | 4 +--- vllm/v1/worker/gpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index d8bc4765ca04e..b44752f4f381c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -4,13 +4,11 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Set import numpy as np -import numpy.typing as npt import torch from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType -from vllm.v1.sample.metadata import (LogitsProcessMetadata, - PromptLogprobsMetadata, SamplingMetadata) +from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 065163c9a3539..aa4842f7467ce 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -22,7 +22,7 @@ FlashAttentionMetadata) from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.sample.metadata import PromptLogprobsMetadata, SamplingMetadata +from 
vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: From c82fc85b19cce72372bfc6aae8d1c87b520a65bb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 03:59:40 +0000 Subject: [PATCH 247/293] updated comment --- vllm/v1/worker/gpu_input_batch.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index b44752f4f381c..a673a6d802e93 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -234,7 +234,7 @@ def add_request( if sampling_params.logprobs: self.num_logprobs[req_id] = sampling_params.logprobs if sampling_params.prompt_logprobs: - # FIXME(rob): handle prefix caching and preemption. + # FIXME(andy): handle prefix caching and preemption. # We currently get incorrect results if prompt logprobs # are requested and we get a cache hit. self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa4842f7467ce..b9b8bf8db78b0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -499,18 +499,17 @@ def _prepare_prompt_logprobs( req_indices: npt.NDArray, ) -> torch.Tensor: - # req_indices is the req_idx of each batched token. - # So if we have 3 sequences of lens [2, 5, 3], - # req_indices = [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] - + # NOTE(rob): req_indices is the req_idx of each token. + # If we have 3 sequences in the batch of lens [2, 5, 3], + # req_indices = [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]. + # Thus, prompt_indices is where req_indices == req_idx. req_idx = self.input_batch.req_id_to_index[req_id] - - # Get the indices for this (prefill) req in current batch. - num_tokens = req_indices.shape[0] - prompt_indices = self.arange_np[:num_tokens][req_indices == req_idx] - if req_id not in scheduler_output.partial_req_ids: - # Remove the sample token if there is one. - indices = indices[:-1] + indices = self.arange_np[:req_indices.shape[0]] + prompt_indices = indices[req_indices == req_idx] + + # Remove the sample token if there is one. + if req_id not in scheduler_output.partial_req_ids: + prompt_indices = prompt_indices[:-1] return prompt_indices From c3c4f9c813c8a4118bfffdfe038f805764dbbed0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 04:09:19 +0000 Subject: [PATCH 248/293] passing mypy! --- vllm/v1/engine/detokenizer.py | 7 +++--- vllm/v1/sample/sampler.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 39 +++++++++++++----------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 08d6fd8bdd410..044378fc38ba4 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -82,6 +82,7 @@ def from_new_request( else: stop_buffer_length = 0 + logprobs = request.sampling_params.logprobs return cls( output_text="", tokens=tokens, @@ -102,10 +103,10 @@ def from_new_request( prompt_token_ids=request.prompt_token_ids, tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - cumulative_logprob=(0. if request.logprobs else None), - logprobs=([] if request.logprobs else None), + cumulative_logprob=(0. 
if logprobs else None), + logprobs=([] if logprobs else None), prompt_logprobs=None, - num_logprobs=request.logprobs, + num_logprobs=(logprobs or 0), ) def _update_sample_logprobs( diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 3a7d271fa2102..e41d2c2e5df49 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -52,7 +52,7 @@ def forward( logprob_token_ids, logprobs = self.get_logprobs( raw_logits, sampling_metadata.max_num_logprobs, - sampled=sampled) + sampled_token_ids=sampled) else: logprob_token_ids, logprobs = None, None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b9b8bf8db78b0..7a4665fe7cdfb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -497,7 +497,7 @@ def _prepare_prompt_logprobs( req_id: str, scheduler_output: "SchedulerOutput", req_indices: npt.NDArray, - ) -> torch.Tensor: + ) -> npt.NDArray: # NOTE(rob): req_indices is the req_idx of each token. # If we have 3 sequences in the batch of lens [2, 5, 3], @@ -506,9 +506,9 @@ def _prepare_prompt_logprobs( req_idx = self.input_batch.req_id_to_index[req_id] indices = self.arange_np[:req_indices.shape[0]] prompt_indices = indices[req_indices == req_idx] - + # Remove the sample token if there is one. - if req_id not in scheduler_output.partial_req_ids: + if req_id not in scheduler_output.partial_req_ids: prompt_indices = prompt_indices[:-1] return prompt_indices @@ -663,25 +663,20 @@ def execute_model( # request separately. Prompt logprobs are rare (used for eval), # and few prefills per batch, so prioritize simple over optimal. prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} - if not self.input_batch.no_prompt_logprob: - for (request_id, num_prompt_logprobs - ) in self.input_batch.num_prompt_logprobs.items(): - - # Prepare mask and logits processor. - metadata = self._prepare_prompt_logprobs( - request_id, scheduler_output, req_indices) - - # Compute logits. - prompt_hidden_states = hidden_states[metadata.prompt_indices] - logits = self.model.sampler.compute_logits( - prompt_hidden_states, None) - - # Compute prompt logprobs. - # TODO(rob): Should we move the sampler out of the model? - prompt_logprobs_dict[ - request_id] = self.model.sampler.get_prompt_logprobs( - logits, metadata.logits_process_metadata, - num_prompt_logprobs) + for (request_id, num_prompt_logprobs + ) in self.input_batch.num_prompt_logprobs.items(): + + # Prepare mask and logits processor. + prompt_indices = self._prepare_prompt_logprobs( + request_id, scheduler_output, req_indices) + + # Compute logits. + prompt_hidden_states = hidden_states[prompt_indices] + logits = self.model.compute_logits(prompt_hidden_states, None) + + # Compute prompt logprobs. 
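For reference, a standalone sketch of the prompt-index selection used above, written out for the [2, 5, 3] example from the helper's note; `is_partial` stands in for the `req_id in scheduler_output.partial_req_ids` check:

    import numpy as np

    req_indices = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2])  # three requests, lens [2, 5, 3]
    arange = np.arange(req_indices.shape[0])

    req_idx = 1                                       # request whose prompt logprobs we want
    prompt_indices = arange[req_indices == req_idx]   # array([2, 3, 4, 5, 6])

    is_partial = False                                # prefill finished this step
    if not is_partial:
        # Remove the sample token if there is one.
        prompt_indices = prompt_indices[:-1]          # array([2, 3, 4, 5])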
+ prompt_logprobs_dict[request_id] = self.model.sampler.get_logprobs( + logits, num_prompt_logprobs) sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From fec3d152d306230f120d19aeb82b6d85812714a0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 04:10:56 +0000 Subject: [PATCH 249/293] comment --- vllm/transformers_utils/detokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 7dd96bd2ef72e..0908b8f085410 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -84,8 +84,8 @@ def detokenize_non_incrementally( # treats it as decoding batch N seq_len 1, such # that they all happen independently. flat_token_ids = token_ids.reshape(-1, 1) - # TODO(rob): deal with MistralTokenizer not doing - # batch_decode? + # TODO(andy): deal with MistralTokenizer not having + # batch_decode. Follow up if hard? return tokenizer.batch_decode(flat_token_ids) From d002d676f0b05d29accaac7d615a34adfdca252b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 04:11:10 +0000 Subject: [PATCH 250/293] todo -> fixme --- vllm/transformers_utils/detokenizer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 0908b8f085410..7e51adda69392 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -84,7 +84,7 @@ def detokenize_non_incrementally( # treats it as decoding batch N seq_len 1, such # that they all happen independently. flat_token_ids = token_ids.reshape(-1, 1) - # TODO(andy): deal with MistralTokenizer not having + # FIXME(andy): deal with MistralTokenizer not having # batch_decode. Follow up if hard? return tokenizer.batch_decode(flat_token_ids) From 3157e8ba78e893283f8dd5f1197e3d769e4ff46c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 04:18:16 +0000 Subject: [PATCH 251/293] updated --- vllm/v1/engine/detokenizer.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 044378fc38ba4..b6eb9c5c7af4f 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -181,11 +181,11 @@ def _update_prompt_logprobs( # Detokenize non-incrementally. # NOTE(rob): the output is flattened: - # [num_tok, num_lps] -> [num_tok * num_lps] + # [num_tok, num_lps] -> [num_tok * num_lps] decoded_tokens = detokenize_non_incrementally(self.tokenizer, token_ids) - # Make Logprob for prompt token. + # Make Logprob for each tokens. The first Logprob is None. 
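As a side note on the flattening mentioned above: every top token id is decoded independently, and the strings belonging to one position are recovered by slicing at multiples of `num_logprobs`. A small sketch with made-up shapes (the tokenizer name is a placeholder):

    import torch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    num_tok, num_lps = 3, 4
    token_ids = torch.randint(0, tokenizer.vocab_size, (num_tok, num_lps))

    # [num_tok, num_lps] -> [num_tok * num_lps]: decode each id on its own.
    flat_decoded = tokenizer.batch_decode(token_ids.reshape(-1, 1))
    assert len(flat_decoded) == num_tok * num_lps

    # Strings belonging to position `tok_idx` start at tok_idx * num_lps.
    tok_idx = 1
    per_position = flat_decoded[tok_idx * num_lps:(tok_idx + 1) * num_lps]
    assert len(per_position) == num_lps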
num_tokens, num_logprobs = logprobs.shape self.prompt_logprobs = [None] + [ self._make_pos_logprob_dict( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7a4665fe7cdfb..fe074482ff656 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -492,12 +492,13 @@ def _prepare_sampling( req_id_output_token_ids, skip_copy) return sampling_metadata - def _prepare_prompt_logprobs( + def _prepare_prompt_indices( self, req_id: str, scheduler_output: "SchedulerOutput", req_indices: npt.NDArray, ) -> npt.NDArray: + """Get the indices of a prompt in the batch.""" # NOTE(rob): req_indices is the req_idx of each token. # If we have 3 sequences in the batch of lens [2, 5, 3], @@ -660,17 +661,15 @@ def execute_model( # Compute prompt logprobs if needed. # NOTE(rob): for simplicity, compute prompt logprobs for each - # request separately. Prompt logprobs are rare (used for eval), + # prompt separately. Prompt logprobs are rare (used for eval), # and few prefills per batch, so prioritize simple over optimal. prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} for (request_id, num_prompt_logprobs ) in self.input_batch.num_prompt_logprobs.items(): - # Prepare mask and logits processor. - prompt_indices = self._prepare_prompt_logprobs( + # Compute of the prompt. + prompt_indices = self._prepare_prompt_indices( request_id, scheduler_output, req_indices) - - # Compute logits. prompt_hidden_states = hidden_states[prompt_indices] logits = self.model.compute_logits(prompt_hidden_states, None) From 60125e335e95e7ad9f1d64fd67f2c8669c83b2c8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 4 Jan 2025 23:10:50 +0000 Subject: [PATCH 252/293] fixed sampler bug Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index e41d2c2e5df49..1a5d350e2dff9 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -122,7 +122,7 @@ def get_logprobs( topk_indices = topk_indices.to(torch.int32) # Concatenate with the sampled token_ids if provided. - if sampled_token_ids: + if sampled_token_ids is not None: # TODO(rob): do we need to return the rank of the sampled? # TODO(andy): is this indexing right? sampled_logprobs = logprobs[:, sampled_token_ids] From 5908cb1426dc93969320237e4a337abf29ec90ad Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sun, 5 Jan 2025 03:29:49 +0000 Subject: [PATCH 253/293] fixed some sampler bugs Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 1a5d350e2dff9..dd34d1a71cbcc 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -125,9 +125,11 @@ def get_logprobs( if sampled_token_ids is not None: # TODO(rob): do we need to return the rank of the sampled? # TODO(andy): is this indexing right? 
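On the indexing question in the TODO above: `logprobs[:, sampled_token_ids]` selects entire columns for every row, whereas the intended quantity is one logprob per row, which is what the per-row indexing introduced just below computes. A minimal sketch, assuming a [num_tokens, vocab] logprobs tensor:

    import torch

    logprobs = torch.randn(3, 10).log_softmax(dim=-1)   # [num_tokens, vocab]
    sampled = torch.tensor([4, 7, 1])                    # one token id per row

    wrong = logprobs[:, sampled]                         # shape [3, 3]: every row x every sampled id
    right = logprobs[torch.arange(3), sampled]           # shape [3]: row i at column sampled[i]
    assert wrong.shape == (3, 3) and right.shape == (3,)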
- sampled_logprobs = logprobs[:, sampled_token_ids] - topk_indices = torch.cat([sampled_token_ids, topk_indices]) - topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs]) + sampled_logprobs = logprobs[torch.arange(logprobs.size(0)), + sampled_token_ids].unsqueeze(-1) + sampled_token_ids = sampled_token_ids.unsqueeze(-1) + topk_indices = torch.cat([sampled_token_ids, topk_indices], dim=1) + topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs], dim=1) return topk_logprobs.cpu(), topk_indices.cpu() From fc520312d67513cffc67b033642a7c9048f157f7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sun, 5 Jan 2025 03:55:32 +0000 Subject: [PATCH 254/293] wip fixing detokenizer test Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 4 +++- tests/v1/engine/utils.py | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 705c3372132d4..20ae0f7e9bd1c 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -167,7 +167,9 @@ def test_incremental_detokenization( spaces_between_special_tokens=False, output_kind=request_output_kind, stop=[], - include_stop_str_in_output=False)) + include_stop_str_in_output=False, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs)) for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index f3617067455da..da01900cfd4ef 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -7,7 +7,7 @@ from transformers.tokenization_utils import PreTrainedTokenizer from vllm.outputs import RequestOutput -from vllm.v1.engine.detokenizer import DetokenizerRequest +from vllm.v1.engine import EngineCoreRequest random.seed(42) @@ -206,7 +206,7 @@ def _decode_token( def validate_requests_logprobs( - requests: List[DetokenizerRequest], + requests: List[EngineCoreRequest], request_outputs: List[RequestOutput], tokenizer: PreTrainedTokenizer, ) -> None: @@ -229,7 +229,9 @@ def validate_requests_logprobs( request_outputs: list of detokenizer outputs """ for req, req_out in zip(requests, request_outputs): - if req.logprobs is not None and req.logprobs > 0: + logprobs = req.sampling_params.logprobs + prompt_logprobs = req.sampling_params.prompt_logprobs + if logprobs is not None and logprobs > 0: # Validate sample logprobs for comp in req_out.outputs: # For each completion @@ -243,7 +245,7 @@ def validate_requests_logprobs( tok_id, tokenizer), "sample logprob decoded token mismatch" - if req.prompt_logprobs is not None and req.prompt_logprobs > 0 and len( + if prompt_logprobs is not None and prompt_logprobs > 0 and len( req_out.prompt_logprobs) > 0: # Validate prompt logprobs assert req_out.prompt_logprobs[ From 6e57de4f4020fd7cb0466890d926943ea5ad3fe0 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 03:53:45 +0000 Subject: [PATCH 255/293] wip Signed-off-by: Andrew Feldman --- vllm/v1/serial_utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 813153a56ef68..c626a66e88a5e 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -4,7 +4,9 @@ import torch from msgspec import msgpack -CUSTOM_TYPE_CODE_PICKLE = 1 +from vllm.v1.engine import EngineCoreOutputs + +CUSTOM_TYPE_CODE_PICKLE = 100 class PickleEncoder: @@ -23,9 +25,16 @@ def __init__(self): self.encoder = 
msgpack.Encoder(enc_hook=custom_enc_hook) def encode(self, obj: Any) -> bytes: + # print("\n\nencode() obj:",obj,"\n\n") + # dec=msgpack.Decoder(EngineCoreOutputs, ext_hook=custom_ext_hook) + # dec.decode(self.encoder.encode(obj)) return self.encoder.encode(obj) def encode_into(self, obj: Any, buf: bytearray) -> None: + print("\n\nbanana:",self.encoder.encode(obj),"\n\n") + # print(f"\n\nencode_into obj:{obj}","\n\n") + # dec=msgpack.Decoder(EngineCoreOutputs, ext_hook=custom_ext_hook) + # dec.decode(self.encoder.encode(obj)) self.encoder.encode_into(obj, buf) @@ -44,7 +53,8 @@ def custom_enc_hook(obj: Any) -> Any: # NOTE(rob): it is fastest to use numpy + pickle # when serializing torch tensors. # https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 - return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) + #return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, msgpack.Encoder().encode(obj.numpy())) else: raise NotImplementedError( f"Objects of type {type(obj)} are not supported") From 599aae883d6f38123a70b33ccff0b5bba6cd4be6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 05:12:36 +0000 Subject: [PATCH 256/293] temporary hack to use pickling Signed-off-by: Andrew Feldman --- vllm/v1/serial_utils.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index c626a66e88a5e..07f0a61f0adba 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -4,9 +4,7 @@ import torch from msgspec import msgpack -from vllm.v1.engine import EngineCoreOutputs - -CUSTOM_TYPE_CODE_PICKLE = 100 +CUSTOM_TYPE_CODE_PICKLE = 1 class PickleEncoder: @@ -25,17 +23,12 @@ def __init__(self): self.encoder = msgpack.Encoder(enc_hook=custom_enc_hook) def encode(self, obj: Any) -> bytes: - # print("\n\nencode() obj:",obj,"\n\n") - # dec=msgpack.Decoder(EngineCoreOutputs, ext_hook=custom_ext_hook) - # dec.decode(self.encoder.encode(obj)) - return self.encoder.encode(obj) + #return self.encoder.encode(obj) + return pickle.dumps(obj) def encode_into(self, obj: Any, buf: bytearray) -> None: - print("\n\nbanana:",self.encoder.encode(obj),"\n\n") - # print(f"\n\nencode_into obj:{obj}","\n\n") - # dec=msgpack.Decoder(EngineCoreOutputs, ext_hook=custom_ext_hook) - # dec.decode(self.encoder.encode(obj)) - self.encoder.encode_into(obj, buf) + #self.encoder.encode_into(obj, buf) + buf[:] = pickle.dumps(obj) class MsgpackDecoder: @@ -45,7 +38,8 @@ def __init__(self, t: Any): self.decoder = msgpack.Decoder(t, ext_hook=custom_ext_hook) def decode(self, obj: Any): - return self.decoder.decode(obj) + #return self.decoder.decode(obj) + return pickle.loads(obj) def custom_enc_hook(obj: Any) -> Any: @@ -54,7 +48,8 @@ def custom_enc_hook(obj: Any) -> Any: # when serializing torch tensors. 
# https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 #return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) - return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, msgpack.Encoder().encode(obj.numpy())) + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, + msgpack.Encoder().encode(obj.numpy())) else: raise NotImplementedError( f"Objects of type {type(obj)} are not supported") From ae1e1b7b7c79c62777eb446894791e21ee75c5f9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 06:21:24 +0000 Subject: [PATCH 257/293] wip detokenizer test Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 57 ++++++++++++------------ tests/v1/engine/utils.py | 69 ++++++++++++++--------------- 2 files changed, 62 insertions(+), 64 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 20ae0f7e9bd1c..02e0da265d9e4 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,7 +1,7 @@ from typing import List, Optional, Tuple -import numpy.typing as npt import pytest +import torch from transformers import AutoTokenizer from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, @@ -29,32 +29,30 @@ FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] PROMPT_LEN = 5 + +# Tokenize prompts under test & create dummy generated tokens PROMPT_TOKENS = [ tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS ] -PROMPT_LOGPROBS_RAW: List[Tuple[npt.NDArray, npt.NDArray]] = [ +GENERATION_TOKENS = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS +] + +# Generate dummy prompt logprobs & sample logprobs for initializing +# the mock engine +PROMPT_LOGPROBS: List[Tuple[torch.Tensor, torch.Tensor]] = [ generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, num_logprobs=NUM_PROMPT_LOGPROBS, tokenizer=tokenizer) for tokens_list in PROMPT_TOKENS ] -# PROMPT_LOGPROBS = [ -# _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) -# for logprobs in PROMPT_LOGPROBS_RAW -# ] -GENERATION_TOKENS = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS -] -GENERATION_LOGPROBS_RAW = [ +GENERATION_LOGPROBS = [ generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, num_logprobs=NUM_SAMPLE_LOGPROBS, tokenizer=tokenizer) for tokens_list in GENERATION_TOKENS ] -# GENERATION_LOGPROBS = [ -# _new_logprobs_detokenized(logprobs=logprobs, tokenizer=tokenizer) -# for logprobs in GENERATION_LOGPROBS_RAW -# ] + PROMPT_STRINGS = [ tokenizer.decode(prompt_tokens, skip_special_tokens=True, @@ -74,9 +72,9 @@ def __init__( self, generated_tokens_list: List[List[int]], prompt_tokens_list: List[List[int]], - generated_logprobs_raw: Optional[List[List[Tuple[npt.NDArray, - npt.NDArray]]]], - prompt_logprobs_raw: Optional[List[Tuple[npt.NDArray, npt.NDArray]]], + generated_logprobs_raw: Optional[List[List[Tuple[torch.Tensor, + torch.Tensor]]]], + prompt_logprobs_raw: Optional[List[Tuple[torch.Tensor, torch.Tensor]]], ) -> None: self.generated_tokens_list = generated_tokens_list self.prompt_tokens_list = prompt_tokens_list @@ -117,6 +115,7 @@ def get_outputs(self) -> List[EngineCoreOutput]: new_token_ids=[generated_token_ids[token_idx]], finished=False, logprobs=logprobs, + logprobs_token_ids=logprobs, prompt_logprobs=prompt_logprobs, prompt_logprobs_token_ids=prompt_logprobs_token_ids, ) @@ -144,12 +143,12 @@ def test_incremental_detokenization( do_generated_logprobs = logprobs is not None do_prompt_logprobs = prompt_logprobs is 
not None detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, - prompt_tokens_list=PROMPT_TOKENS, - generated_logprobs_raw=GENERATION_LOGPROBS_RAW - if do_generated_logprobs else None, - prompt_logprobs_raw=PROMPT_LOGPROBS_RAW - if do_prompt_logprobs else None) + engine_core = MockEngineCore( + generated_tokens_list=GENERATION_TOKENS, + prompt_tokens_list=PROMPT_TOKENS, + generated_logprobs_raw=GENERATION_LOGPROBS + if do_generated_logprobs else None, + prompt_logprobs_raw=PROMPT_LOGPROBS if do_prompt_logprobs else None) # Make N requests. requests = [ @@ -232,12 +231,12 @@ def test_stop_string( do_generated_logprobs = logprobs is not None do_prompt_logprobs = prompt_logprobs is not None detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore(generated_tokens_list=GENERATION_TOKENS, - prompt_tokens_list=PROMPT_TOKENS, - generated_logprobs_raw=GENERATION_LOGPROBS_RAW - if do_generated_logprobs else None, - prompt_logprobs_raw=PROMPT_LOGPROBS_RAW - if do_prompt_logprobs else None) + engine_core = MockEngineCore( + generated_tokens_list=GENERATION_TOKENS, + prompt_tokens_list=PROMPT_TOKENS, + generated_logprobs_raw=GENERATION_LOGPROBS + if do_generated_logprobs else None, + prompt_logprobs_raw=PROMPT_LOGPROBS if do_prompt_logprobs else None) # Make N requests. requests = [ diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index da01900cfd4ef..fc66cebdc6732 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -2,8 +2,7 @@ import random from typing import List, Tuple -import numpy as np -import numpy.typing as npt +import torch from transformers.tokenization_utils import PreTrainedTokenizer from vllm.outputs import RequestOutput @@ -16,7 +15,7 @@ def _create_random_top_logprob_test_vector( num_logprobs: int, lower: float, upper: float, -) -> npt.NDArray: +) -> torch.Tensor: """Create a random vector of top logprob float values. Use to create fake sample logprobs for testing. @@ -31,16 +30,16 @@ def _create_random_top_logprob_test_vector( upper: upper range of logprob float values Returns: - 1D length-`num_logprobs` np array of float logprob values + 1D length-`num_logprobs` torch Tensor of float logprob values """ - return np.random.rand(num_logprobs) * (upper - lower) + lower + return torch.rand(num_logprobs) * (upper - lower) + lower def _create_random_top_logprob_test_matrix( shape: Tuple, lower: float, upper: float, -) -> npt.NDArray: +) -> torch.Tensor: """Create a random matrix of top logprob float values. Use to create fake prompt logprobs for testing. @@ -56,24 +55,20 @@ def _create_random_top_logprob_test_matrix( upper: upper range of logprob float values Returns: - 2D num_tokens x num_logprobs np array of float logprob values + 2D num_tokens x num_logprobs torch Tensor of float logprob values """ - return np.random.rand(*shape) * (upper - lower) + lower + return torch.rand(*shape) * (upper - lower) + lower def _create_random_top_token_test_vector( - num_logprobs: int, - lower: int, - upper: int, - sampled_token_id: int, - adjust_num_logprobs: bool, -) -> npt.NDArray: + num_logprobs: int, lower: int, upper: int, sampled_token_id: int, + adjust_num_logprobs: bool) -> torch.Tensor: """Create a random vector of top logprob token indices Use to create fake sample logprobs for testing. The sampled token ID must always be one of the top logprobs, which this dummy test vector generator enforces. 
OpenAI API - compatible engines must be able to return an addition sample + compatible engines must be able to return an additional sample logprob for the sampled token if the sampled token was not among the top sample logprobs; `adjust_num_logprobs` emulates this behavior by increasing the vector length by 1 if @@ -89,23 +84,27 @@ def _create_random_top_token_test_vector( logprobs Returns: - 1D length-x np array of token ids where x is + 1D length-x torch Tensor of token ids where x is `num_logprobs+1` if `adjust_num_logprobs` and `num_logprobs` otherwise """ - choice_list = list(range(lower, upper)) - res = np.random.choice(choice_list, (num_logprobs + - (1 if adjust_num_logprobs else 0), ), - replace=False) - res[-1] = sampled_token_id - return res + # Calculate the final number of logprobs required + total_logprobs = num_logprobs + (1 if adjust_num_logprobs else 0) + + # Generate random indices using torch + choice_tensor = torch.randperm(upper - lower)[:total_logprobs] + lower + + # Ensure the sampled token ID is included in the tensor + choice_tensor[-1] = sampled_token_id + + return choice_tensor def _create_random_top_token_test_matrix( - shape: Tuple, + shape: Tuple[int, int], lower: int, upper: int, -) -> npt.NDArray: +) -> torch.Tensor: """Create a random matrix of top logprob token indices Use to create fake prompt logprobs for testing. @@ -114,24 +113,24 @@ def _create_random_top_token_test_matrix( replacement. Args: - shape: (num_tokens,num_logprobs) tuple representing + shape: (num_tokens, num_logprobs) tuple representing matrix shape lower: lower range of token ids upper: upper range of token ids Returns: - 2D num_tokens x num_logprobs np array of token ids + 2D num_tokens x num_logprobs torch Tensor of token ids """ - choice_list = list(range(lower, upper)) - res = np.random.choice(choice_list, (shape[0], shape[1]), replace=False) - return res + num_elements = shape[0] * shape[1] + choice_tensor = torch.randperm(upper - lower)[:num_elements] + lower + return choice_tensor.view(shape) def generate_dummy_sample_logprobs( sampled_tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, -) -> List[Tuple[npt.NDArray, npt.NDArray]]: +) -> List[Tuple[torch.Tensor, torch.Tensor]]: """Generate dummy sample logprobs Generate a test data structure which imitates the list of sample logprobs @@ -143,8 +142,8 @@ def generate_dummy_sample_logprobs( tokenizer: model tokenizer to use for detokenization Returns - List of (logprobs vector, top token ids vector) np array tuples; each pair - of vectors have the same length which is either `num_logprobs` or + List of (logprobs vector, top token ids vector) torch Tensor tuples; each + pair of vectors have the same length which is either `num_logprobs` or `num_logprobs+1` """ res = [] @@ -163,10 +162,10 @@ def generate_dummy_prompt_logprobs( prompt_tokens_list: List, num_logprobs: int, tokenizer: PreTrainedTokenizer, -) -> Tuple[npt.NDArray, npt.NDArray]: +) -> Tuple[torch.Tensor, torch.Tensor]: """Generate dummy prompt logprobs - Generate a test data structure which imitates the np arrays of prompt + Generate a test data structure which imitates the torch Tensors of prompt logprobs which would be assembled in the engine core during chunked prefill. 
@@ -176,7 +175,7 @@ def generate_dummy_prompt_logprobs( tokenizer: model tokenizer to use for detokenization Returns - Single Tuple of (logprobs matrix, top token ids matrix) np arrays, + Single Tuple of (logprobs matrix, top token ids matrix) torch Tensor, where both matrices have dimensions num_prompt_tokens x num_logprobs """ From a1c5b2e3cc074430cc9d32e9e927502b77349f98 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 07:12:50 +0000 Subject: [PATCH 258/293] fix: logprobs not being wrapped in an array Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 2 +- vllm/v1/sample/sampler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index a470760796e41..6d11c258e6054 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -450,7 +450,7 @@ def update_from_output( assert logprobs_cpu is not None # Here we assume there is 1 generated token per step. logprobs_token_ids = [logprobs_token_ids_cpu[req_index]] - logprobs = logprobs_cpu[req_index] + logprobs = [logprobs_cpu[req_index]] # Extract prompt logprobs for this req if needed. # FIXME(rob): Currently we throw away the prompt logprobs diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index dd34d1a71cbcc..1b51a7eea2819 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -49,12 +49,12 @@ def forward( if needs_logprobs: # Get sampled and topk token logprobs. # NOTE: CPU<>GPU sync happens here. - logprob_token_ids, logprobs = self.get_logprobs( + logprobs, logprob_token_ids = self.get_logprobs( raw_logits, sampling_metadata.max_num_logprobs, sampled_token_ids=sampled) else: - logprob_token_ids, logprobs = None, None + logprobs, logprob_token_ids = None, None # NOTE: CPU-GPU synchronization happens here. sampler_output = SamplerOutput( From 728837034682d168cd06c3d4c891b2c9dcd4affe Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 10:55:47 +0000 Subject: [PATCH 259/293] sample logprobs work Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 2 +- vllm/v1/engine/detokenizer.py | 74 +++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 6d11c258e6054..c20aef64100b4 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -456,7 +456,7 @@ def update_from_output( # FIXME(rob): Currently we throw away the prompt logprobs # of an in progress partial request. We can handle this # by updating the Request object to hold prompt logprobs. - prompt_logprobs_token_ids, prompt_logprobs = ( + prompt_logprobs, prompt_logprobs_token_ids = ( prompt_logprobs_dict.get(req_id, (None, None))) # Add EngineCoreOutput for this Request. diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index b6eb9c5c7af4f..9f9d687a0d718 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -144,18 +144,23 @@ def _update_sample_logprobs( decoded_tokens = detokenize_non_incrementally( self.tokenizer, topk_token_ids) - # Make the Logprob objects the position. - pos_logprobs_dict = self._make_pos_logprob_dict( - topk_token_ids.tolist(), topk_logprobs.tolist(), - decoded_tokens, self.num_logprobs) - - # Add the sampled Logprob if it was not in topk. 
- if sampled_token_id not in pos_logprobs_dict: - token = self.tokenizer.decode(sampled_token_id) - pos_logprobs_dict[sampled_token_id] = Logprob( + # Make the dict of top-token Logprob objects associated with the + # current sequence offset + if sampled_token_id in topk_token_ids: + pos_logprobs_dict = self._make_pos_logprob_dict( + topk_logprobs.tolist(), topk_token_ids.tolist(), + decoded_tokens, self.num_logprobs) + else: + # If the sampled token is not one of the top tokens + # at this sequence offset, inject the sampled token + # & its Logprob instance into the dict + sample_logprob_obj = Logprob( logprob=sampled_token_logprob, - rank=None, # TODO: is this needed? - decoded_token=token) + decoded_token=self.tokenizer.decode(sampled_token_id)) + pos_logprobs_dict = self._make_pos_logprob_dict( + topk_logprobs.tolist(), topk_token_ids.tolist(), + decoded_tokens, self.num_logprobs, + (sampled_token_id, sample_logprob_obj)) self.logprobs.append(pos_logprobs_dict) self.cumulative_logprob += sampled_token_logprob @@ -205,20 +210,57 @@ def _make_pos_logprob_dict( token_ids: List[int], decoded_tokens: List[str], num_logprobs: int, + sampled_token_id_logprob: Optional[Tuple[int, Logprob]] = None, ) -> Dict[int, Logprob]: - """Make a Logprob dictionary for a position in the sequence.""" - + """Make a Logprob dictionary for a position in the sequence. + + Returns a dictionary mapping top token ids to Logprob data + structures. Each Logprob data structure includes log probability, + decoded token, and rank (1-indexed). The size of the dict returned + will be be num_logprobs. + + If the sampled token is not among the top logprobs, then + sampled_token_id_logprob = (sampled_token_id,sample_logprob) must be + provided; an additional dictionary entry mapping sampled_token_id -> + sample_logprob will be injected with rank equal to num_logprobs + 1 + (sampled_token_id must be lowest-rank if we are having to inject it.) + Note that the size of the dict returned will then be num_logprobs + 1. + + Args: + logprobs: list of log probabilities + token_ids: list of top token ids + decoded_tokens: list of decoded top tokens + num_logprobs: number of top tokens + sampled_token_id_logprob: (optional) tuple of + (sampled_token_id,sample_logprob) + + Returns: + Dict[top token id, Logprob]; num_logprobs or num_logprobs+1 + keys in total + + """ # Sampler uses torch.topk() which sorts so the - # index in lists is equivalent to rank. - return { + # index in lists is equivalent to rank-1. 
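A worked example of the dictionary built below, for num_logprobs=2, top token ids [11, 22], and a sampled id 33 that missed the top-k; the Logprob class here is a local stand-in with the same three fields as vLLM's container:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Logprob:                      # stand-in: logprob, rank, decoded_token
        logprob: float
        rank: Optional[int] = None
        decoded_token: Optional[str] = None

    num_logprobs = 2
    topk_ids, topk_lps, decoded = [11, 22], [-0.1, -2.3], ["a", "b"]

    pos_dict = {
        topk_ids[i]: Logprob(topk_lps[i], rank=i + 1, decoded_token=decoded[i])
        for i in range(num_logprobs)
    }
    # Sampled token 33 was not in the top-k, so it is injected with the lowest rank.
    pos_dict[33] = Logprob(-4.0, rank=num_logprobs + 1, decoded_token="c")
    assert len(pos_dict) == num_logprobs + 1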
+ logprobs_dict = { token_ids[idx]: Logprob( logprob=logprobs[idx], - rank=idx, + rank=idx + 1, decoded_token=decoded_tokens[idx], ) for idx in range(num_logprobs) } + # Inject sampled token Logprob if necessary + if sampled_token_id_logprob: + sampled_token_id = sampled_token_id_logprob[0] + sample_logprob_obj = sampled_token_id_logprob[1] + assert sampled_token_id is not None + assert sample_logprob_obj is not None + sample_logprob_obj.rank = num_logprobs + 1 + logprobs_dict[sampled_token_id] = sample_logprob_obj + + return logprobs_dict + def add_tokens( self, new_token_ids: List[int], From 0e90ccb95b1a0e97f7000f6a7d09e0364d2e18db Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 12:46:37 +0000 Subject: [PATCH 260/293] detokenizer test passing for sample logprobs Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 10 ++++++---- tests/v1/engine/utils.py | 9 ++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 02e0da265d9e4..d68d68e1bf0c1 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -95,11 +95,13 @@ def get_outputs(self) -> List[EngineCoreOutput]: if len(generated_token_ids) > token_idx: if do_logprobs: assert self.generated_logprobs_raw is not None - logprobs = [ - self.generated_logprobs_raw[req_idx][token_idx] - ] + (logprobs, logprobs_token_ids) = ( + self.generated_logprobs_raw[req_idx][token_idx]) + logprobs = [logprobs] + logprobs_token_ids = [logprobs_token_ids] else: logprobs = None + logprobs_token_ids = None if do_prompt_logprobs: if self.current_idx == 0: assert self.prompt_logprobs_raw is not None @@ -115,7 +117,7 @@ def get_outputs(self) -> List[EngineCoreOutput]: new_token_ids=[generated_token_ids[token_idx]], finished=False, logprobs=logprobs, - logprobs_token_ids=logprobs, + logprobs_token_ids=logprobs_token_ids, prompt_logprobs=prompt_logprobs, prompt_logprobs_token_ids=prompt_logprobs_token_ids, ) diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index fc66cebdc6732..1a2d98d7fe64c 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -32,7 +32,7 @@ def _create_random_top_logprob_test_vector( Returns: 1D length-`num_logprobs` torch Tensor of float logprob values """ - return torch.rand(num_logprobs) * (upper - lower) + lower + return torch.rand(num_logprobs + 1) * (upper - lower) + lower def _create_random_top_logprob_test_matrix( @@ -89,13 +89,13 @@ def _create_random_top_token_test_vector( `num_logprobs` otherwise """ # Calculate the final number of logprobs required - total_logprobs = num_logprobs + (1 if adjust_num_logprobs else 0) + total_logprobs = num_logprobs + 1 # Generate random indices using torch choice_tensor = torch.randperm(upper - lower)[:total_logprobs] + lower # Ensure the sampled token ID is included in the tensor - choice_tensor[-1] = sampled_token_id + choice_tensor[0] = sampled_token_id return choice_tensor @@ -200,8 +200,7 @@ def _decode_token( Returns: string representation of token """ - return tokenizer.convert_ids_to_tokens([tok_id], - skip_special_tokens=False)[0] + return tokenizer.batch_decode([tok_id])[0] def validate_requests_logprobs( From c2f48fb6be7556046924a1eb68f25101497fdbae Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 15:36:50 +0000 Subject: [PATCH 261/293] detokenizer tests passing Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 4 +++- vllm/v1/engine/detokenizer.py | 25 
+++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index d68d68e1bf0c1..1065fed39c7f1 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -109,7 +109,9 @@ def get_outputs(self) -> List[EngineCoreOutput]: prompt_logprobs_token_ids = self.prompt_logprobs_raw[ req_idx][1] else: - (prompt_logprobs, prompt_logprobs_token_ids) = ([], []) + (prompt_logprobs, + prompt_logprobs_token_ids) = (torch.empty(0, 0), + torch.empty(0, 0)) else: (prompt_logprobs, prompt_logprobs_token_ids) = (None, None) output = EngineCoreOutput( diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 9f9d687a0d718..7d0afd4f83e8e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -51,6 +51,7 @@ class IncrementalDetokenizer: prompt_logprobs: Optional[PromptLogprobs] cumulative_logprob: Optional[float] num_logprobs: int + num_prompt_logprobs: int # Accounting for stop string buffering stop_buffer_length: int @@ -83,6 +84,7 @@ def from_new_request( stop_buffer_length = 0 logprobs = request.sampling_params.logprobs + prompt_logprobs = request.sampling_params.prompt_logprobs return cls( output_text="", tokens=tokens, @@ -105,7 +107,8 @@ def from_new_request( stop_buffer_length=stop_buffer_length, cumulative_logprob=(0. if logprobs else None), logprobs=([] if logprobs else None), - prompt_logprobs=None, + prompt_logprobs=([] if prompt_logprobs else None), + num_prompt_logprobs=(prompt_logprobs or 0), num_logprobs=(logprobs or 0), ) @@ -123,8 +126,8 @@ def _update_sample_logprobs( token_ids: [topk + 1]: topk token ids at pos logprobs: [topk + 1]: topk logprobs at pos """ - if self.num_logprobs == 0: + # Sample logprobs disabled for this request return None assert self.logprobs is not None @@ -174,15 +177,17 @@ def _update_prompt_logprobs( token_ids: Optional[torch.Tensor], logprobs: Optional[torch.Tensor], ) -> Optional[PromptLogprobs]: - - # Skip if no prompt logprobs were generated. - if token_ids is None: + if self.num_prompt_logprobs == 0: + # Prompt logprobs disabled for this request return None - assert logprobs is not None - # EngineCore does not stream until entire prompt complete, - # so Detokenizer should get all prompt lps at once. - assert self.prompt_logprobs is None + # Prompt logprobs enabled but none generated in this step + if token_ids is None or (logprobs is not None + and logprobs.numel() == 0): + # EngineCore does not stream until entire prompt complete, + # so Detokenizer should get all prompt lps at once. + return [] + assert logprobs is not None # Detokenize non-incrementally. # NOTE(rob): the output is flattened: @@ -199,7 +204,7 @@ def _update_prompt_logprobs( # Deal with the flattening from above. decoded_tokens[tok_idx * num_logprobs:], num_logprobs, - ) for tok_idx in range(num_tokens) + ) for tok_idx in range(num_tokens - 1) ] return self.prompt_logprobs From 13177d4ac44d7379fb07a69db9e9182f4ef62ede Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 21:47:03 +0000 Subject: [PATCH 262/293] prompt logprobs with chunked prefill! 
Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 27 ++++++++++++----- vllm/v1/engine/detokenizer.py | 47 ++++++++++++++++++++---------- vllm/v1/sample/sampler.py | 2 -- vllm/v1/worker/gpu_model_runner.py | 7 ++++- 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index c20aef64100b4..6054de6ce2ea1 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -429,6 +429,10 @@ def update_from_output( # in the decoder's KV cache. self.encoder_cache_manager.free(request, input_id) + # Extract prompt logprobs for this req if needed. + prompt_logprobs, prompt_logprobs_token_ids = ( + prompt_logprobs_dict.get(req_id, (None, None))) + if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request @@ -452,13 +456,6 @@ def update_from_output( logprobs_token_ids = [logprobs_token_ids_cpu[req_index]] logprobs = [logprobs_cpu[req_index]] - # Extract prompt logprobs for this req if needed. - # FIXME(rob): Currently we throw away the prompt logprobs - # of an in progress partial request. We can handle this - # by updating the Request object to hold prompt logprobs. - prompt_logprobs, prompt_logprobs_token_ids = ( - prompt_logprobs_dict.get(req_id, (None, None))) - # Add EngineCoreOutput for this Request. output = EngineCoreOutput( request_id=req_id, @@ -476,6 +473,22 @@ def update_from_output( if stopped: continue + elif prompt_logprobs is not None: + # Chunked prefill & prompt logprobs is enabled; transmit partial + # logprobs via EngineCoreOutput + # Add EngineCoreOutput for this Request. + output = EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason, + logprobs_token_ids=[], + logprobs=[], + prompt_logprobs_token_ids=prompt_logprobs_token_ids, + prompt_logprobs=prompt_logprobs) + engine_core_outputs.append(output) + new_running.append(request) self.running = new_running return engine_core_outputs diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 7d0afd4f83e8e..fd185338d0938 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -180,14 +180,16 @@ def _update_prompt_logprobs( if self.num_prompt_logprobs == 0: # Prompt logprobs disabled for this request return None - - # Prompt logprobs enabled but none generated in this step - if token_ids is None or (logprobs is not None - and logprobs.numel() == 0): - # EngineCore does not stream until entire prompt complete, - # so Detokenizer should get all prompt lps at once. - return [] assert logprobs is not None + assert token_ids is not None + if logprobs.numel() == 0: + # Prompt logprobs are enabled for this request but prefill + # is finished and no more logprobs are being streamed from + # engine core + return [] + # Prompt logprobs are enabled & engine core is streaming prompt + # logprobs, in one or more chunks. + assert self.prompt_logprobs is not None # Detokenize non-incrementally. # NOTE(rob): the output is flattened: @@ -197,15 +199,28 @@ def _update_prompt_logprobs( # Make Logprob for each tokens. The first Logprob is None. num_tokens, num_logprobs = logprobs.shape - self.prompt_logprobs = [None] + [ - self._make_pos_logprob_dict( - logprobs[tok_idx].tolist(), - token_ids[tok_idx].tolist(), - # Deal with the flattening from above. 
- decoded_tokens[tok_idx * num_logprobs:], - num_logprobs, - ) for tok_idx in range(num_tokens - 1) - ] + if len(self.prompt_logprobs) == 0: + # Buffer initial chunk of logprobs during prefill + self.prompt_logprobs = [None] + [ + self._make_pos_logprob_dict( + logprobs[tok_idx].tolist(), + token_ids[tok_idx].tolist(), + # Deal with the flattening from above. + decoded_tokens[tok_idx * num_logprobs:], + num_logprobs, + ) for tok_idx in range(num_tokens) + ] + else: + # Buffer subsequent chunk of logprobs during prefill + self.prompt_logprobs += [ + self._make_pos_logprob_dict( + logprobs[tok_idx].tolist(), + token_ids[tok_idx].tolist(), + # Deal with the flattening from above. + decoded_tokens[tok_idx * num_logprobs:], + num_logprobs, + ) for tok_idx in range(num_tokens) + ] return self.prompt_logprobs diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 1b51a7eea2819..c8407f4cabeff 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -123,8 +123,6 @@ def get_logprobs( # Concatenate with the sampled token_ids if provided. if sampled_token_ids is not None: - # TODO(rob): do we need to return the rank of the sampled? - # TODO(andy): is this indexing right? sampled_logprobs = logprobs[torch.arange(logprobs.size(0)), sampled_token_ids].unsqueeze(-1) sampled_token_ids = sampled_token_ids.unsqueeze(-1) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b43e4ef4cf3c6..703edc48fba90 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -502,10 +502,15 @@ def _prepare_prompt_indices( # If we have 3 sequences in the batch of lens [2, 5, 3], # req_indices = [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]. # Thus, prompt_indices is where req_indices == req_idx. + print("\n\nreq_id_to_index:", self.input_batch.req_id_to_index, "\n\n") + print("\nreq_id:", req_id, "\n") + print("\nreq_indices:", req_indices, "\n") + print("\nscheduler_output.partial_req_ids:", + scheduler_output.partial_req_ids, "\n") req_idx = self.input_batch.req_id_to_index[req_id] indices = self.arange_np[:req_indices.shape[0]] prompt_indices = indices[req_indices == req_idx] - + print("\n\n") # Remove the sample token if there is one. if req_id not in scheduler_output.partial_req_ids: prompt_indices = prompt_indices[:-1] From 05536f574fa1281da019b27398bef5914f6848be Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 21:48:37 +0000 Subject: [PATCH 263/293] cleanup Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_model_runner.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 703edc48fba90..71ef192456eea 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -502,15 +502,9 @@ def _prepare_prompt_indices( # If we have 3 sequences in the batch of lens [2, 5, 3], # req_indices = [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]. # Thus, prompt_indices is where req_indices == req_idx. - print("\n\nreq_id_to_index:", self.input_batch.req_id_to_index, "\n\n") - print("\nreq_id:", req_id, "\n") - print("\nreq_indices:", req_indices, "\n") - print("\nscheduler_output.partial_req_ids:", - scheduler_output.partial_req_ids, "\n") req_idx = self.input_batch.req_id_to_index[req_id] indices = self.arange_np[:req_indices.shape[0]] prompt_indices = indices[req_indices == req_idx] - print("\n\n") # Remove the sample token if there is one. 
if req_id not in scheduler_output.partial_req_ids: prompt_indices = prompt_indices[:-1] From 0d17df824f94633240bf01d37b980fda10c9ac5f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 22:05:12 +0000 Subject: [PATCH 264/293] light refactor Signed-off-by: Andrew Feldman --- vllm/transformers_utils/detokenizer_utils.py | 2 +- vllm/v1/engine/detokenizer.py | 54 ++++++++------------ 2 files changed, 23 insertions(+), 33 deletions(-) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 7e51adda69392..b93f86a805c42 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -86,7 +86,7 @@ def detokenize_non_incrementally( flat_token_ids = token_ids.reshape(-1, 1) # FIXME(andy): deal with MistralTokenizer not having # batch_decode. Follow up if hard? - return tokenizer.batch_decode(flat_token_ids) + return tokenizer.batch_decode(flat_token_ids) # type: ignore # Based on diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index fd185338d0938..0725b1ac3742e 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -116,7 +116,7 @@ def _update_sample_logprobs( self, sampled_token_ids: List[int], token_ids_lst: List[torch.Tensor], - logprobs_lst: List[torch.Tensor], + sample_logprobs_lst: List[torch.Tensor], ) -> Optional[SampleLogprobs]: """ Lists are only of length >1 if EngineCore made @@ -124,7 +124,7 @@ def _update_sample_logprobs( Tensors are: token_ids: [topk + 1]: topk token ids at pos - logprobs: [topk + 1]: topk logprobs at pos + sample_logprobs: [topk + 1]: topk logprobs at pos """ if self.num_logprobs == 0: # Sample logprobs disabled for this request @@ -132,7 +132,7 @@ def _update_sample_logprobs( assert self.logprobs is not None for sampled_token_id, logprobs, token_ids in zip( - sampled_token_ids, logprobs_lst, token_ids_lst): + sampled_token_ids, sample_logprobs_lst, token_ids_lst): # Split into sampled vs top_k. assert sampled_token_id == token_ids[0].item(), ( @@ -175,14 +175,14 @@ def _update_sample_logprobs( def _update_prompt_logprobs( self, token_ids: Optional[torch.Tensor], - logprobs: Optional[torch.Tensor], + prompt_logprobs: Optional[torch.Tensor], ) -> Optional[PromptLogprobs]: if self.num_prompt_logprobs == 0: # Prompt logprobs disabled for this request return None - assert logprobs is not None + assert prompt_logprobs is not None assert token_ids is not None - if logprobs.numel() == 0: + if prompt_logprobs.numel() == 0: # Prompt logprobs are enabled for this request but prefill # is finished and no more logprobs are being streamed from # engine core @@ -196,32 +196,22 @@ def _update_prompt_logprobs( # [num_tok, num_lps] -> [num_tok * num_lps] decoded_tokens = detokenize_non_incrementally(self.tokenizer, token_ids) - - # Make Logprob for each tokens. The first Logprob is None. - num_tokens, num_logprobs = logprobs.shape - if len(self.prompt_logprobs) == 0: - # Buffer initial chunk of logprobs during prefill - self.prompt_logprobs = [None] + [ - self._make_pos_logprob_dict( - logprobs[tok_idx].tolist(), - token_ids[tok_idx].tolist(), - # Deal with the flattening from above. - decoded_tokens[tok_idx * num_logprobs:], - num_logprobs, - ) for tok_idx in range(num_tokens) - ] - else: - # Buffer subsequent chunk of logprobs during prefill - self.prompt_logprobs += [ - self._make_pos_logprob_dict( - logprobs[tok_idx].tolist(), - token_ids[tok_idx].tolist(), - # Deal with the flattening from above. 
- decoded_tokens[tok_idx * num_logprobs:], - num_logprobs, - ) for tok_idx in range(num_tokens) - ] - + # Make Logprob for each token. + num_tokens, num_logprobs = prompt_logprobs.shape + chunk_prompt_logprobs = [ + self._make_pos_logprob_dict( + prompt_logprobs[tok_idx].tolist(), + token_ids[tok_idx].tolist(), + # Deal with the flattening from above. + decoded_tokens[tok_idx * num_logprobs:], + num_logprobs, + ) for tok_idx in range(num_tokens) + ] + # Buffer prefill chunk + self.prompt_logprobs = ( + ([None] # First logprob is None + if len(self.prompt_logprobs) == 0 else self.prompt_logprobs) + + chunk_prompt_logprobs) return self.prompt_logprobs @staticmethod From f7071919ecb9831f409c49e0f15df995bf0f0815 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 6 Jan 2025 22:59:45 +0000 Subject: [PATCH 265/293] torch serialization with msgpack via enc_/ext_hooksgit status! Signed-off-by: Andrew Feldman --- vllm/v1/engine/__init__.py | 2 +- vllm/v1/serial_utils.py | 13 ++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 2b7c70e977489..be6c7a441eaab 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -35,7 +35,7 @@ class EngineCoreRequest: class EngineCoreOutput( msgspec.Struct, - array_like=True, # type: ignore[call-arg] + array_like=False, # type: ignore[call-arg] omit_defaults=True, # type: ignore[call-arg] gc=False): # type: ignore[call-arg] diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 07f0a61f0adba..813153a56ef68 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -23,12 +23,10 @@ def __init__(self): self.encoder = msgpack.Encoder(enc_hook=custom_enc_hook) def encode(self, obj: Any) -> bytes: - #return self.encoder.encode(obj) - return pickle.dumps(obj) + return self.encoder.encode(obj) def encode_into(self, obj: Any, buf: bytearray) -> None: - #self.encoder.encode_into(obj, buf) - buf[:] = pickle.dumps(obj) + self.encoder.encode_into(obj, buf) class MsgpackDecoder: @@ -38,8 +36,7 @@ def __init__(self, t: Any): self.decoder = msgpack.Decoder(t, ext_hook=custom_ext_hook) def decode(self, obj: Any): - #return self.decoder.decode(obj) - return pickle.loads(obj) + return self.decoder.decode(obj) def custom_enc_hook(obj: Any) -> Any: @@ -47,9 +44,7 @@ def custom_enc_hook(obj: Any) -> Any: # NOTE(rob): it is fastest to use numpy + pickle # when serializing torch tensors. 
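A self-contained sketch of the hook pair this encoder relies on; only the encode hook appears in this patch, so the `ext_hook` body below (pickle plus `torch.from_numpy`) is an assumption about the decode side:

    import pickle

    import torch
    from msgspec import msgpack

    CUSTOM_TYPE_CODE_PICKLE = 1

    def enc_hook(obj):
        if isinstance(obj, torch.Tensor):
            # numpy + pickle, as in the note above.
            return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy()))
        raise NotImplementedError(f"Unsupported type: {type(obj)}")

    def ext_hook(code, data):
        if code == CUSTOM_TYPE_CODE_PICKLE:
            return torch.from_numpy(pickle.loads(data))
        raise NotImplementedError(f"Unsupported ext code: {code}")

    encoder = msgpack.Encoder(enc_hook=enc_hook)
    decoder = msgpack.Decoder(ext_hook=ext_hook)

    t = torch.arange(6, dtype=torch.float32).reshape(2, 3)
    assert torch.equal(decoder.decode(encoder.encode(t)), t)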
# https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 - #return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) - return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, - msgpack.Encoder().encode(obj.numpy())) + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) else: raise NotImplementedError( f"Objects of type {type(obj)} are not supported") From 3546639bb75f79b823b7a8c49351c5a9441cdffc Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 02:27:03 +0000 Subject: [PATCH 266/293] wip Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 18 ++++++++++++------ vllm/v1/worker/gpu_model_runner.py | 8 +++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index c8407f4cabeff..3b22aac0011c9 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -52,7 +52,7 @@ def forward( logprobs, logprob_token_ids = self.get_logprobs( raw_logits, sampling_metadata.max_num_logprobs, - sampled_token_ids=sampled) + actual_token_ids=sampled) else: logprobs, logprob_token_ids = None, None @@ -111,8 +111,14 @@ def get_logprobs( self, logits: torch.Tensor, num_logprobs: int, - sampled_token_ids: Optional[torch.Tensor] = None, + actual_token_ids: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute logprobs from logits. + + + + """ + # Compute logprobs. logprobs = logits.log_softmax(dim=-1, dtype=torch.float32) topk_logprobs, topk_indices = torch.topk(logprobs, @@ -122,11 +128,11 @@ def get_logprobs( topk_indices = topk_indices.to(torch.int32) # Concatenate with the sampled token_ids if provided. - if sampled_token_ids is not None: + if actual_token_ids is not None: sampled_logprobs = logprobs[torch.arange(logprobs.size(0)), - sampled_token_ids].unsqueeze(-1) - sampled_token_ids = sampled_token_ids.unsqueeze(-1) - topk_indices = torch.cat([sampled_token_ids, topk_indices], dim=1) + actual_token_ids].unsqueeze(-1) + actual_token_ids = actual_token_ids.unsqueeze(-1) + topk_indices = torch.cat([actual_token_ids, topk_indices], dim=1) topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs], dim=1) return topk_logprobs.cpu(), topk_indices.cpu() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 24216775e28aa..053dc745f8b7f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -659,6 +659,9 @@ def execute_model( # prompt separately. Prompt logprobs are rare (used for eval), # and few prefills per batch, so prioritize simple over optimal. prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} + if len(self.input_batch.num_prompt_logprobs) > 0: + # Prompt token ids are required for computing prompt logprobs + assert self.input_batch.prompt_token_ids is not None for (request_id, num_prompt_logprobs ) in self.input_batch.num_prompt_logprobs.items(): @@ -670,7 +673,10 @@ def execute_model( # Compute prompt logprobs. prompt_logprobs_dict[request_id] = self.model.sampler.get_logprobs( - logits, num_prompt_logprobs) + logits, + num_prompt_logprobs, + actual_token_ids=self.input_batch. 
+ prompt_token_ids[prompt_indices]) sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From 69218abc1e66771e6097c9c226554e6c8ed82ad3 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 03:17:14 +0000 Subject: [PATCH 267/293] GPU returns num_prompt_logprobs + 1 prompt logprobs Signed-off-by: Andrew Feldman --- vllm/v1/sample/sampler.py | 39 +++++++++++++++++++----------- vllm/v1/worker/gpu_model_runner.py | 5 ++-- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 3b22aac0011c9..3c2e3db0eed43 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Optional, Tuple +from typing import Tuple import torch import torch.nn as nn @@ -52,7 +52,7 @@ def forward( logprobs, logprob_token_ids = self.get_logprobs( raw_logits, sampling_metadata.max_num_logprobs, - actual_token_ids=sampled) + token_ids=sampled) else: logprobs, logprob_token_ids = None, None @@ -111,14 +111,26 @@ def get_logprobs( self, logits: torch.Tensor, num_logprobs: int, - actual_token_ids: Optional[torch.Tensor] = None, + token_ids: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: """Compute logprobs from logits. - - - - """ + Also compute logprobs associated with `token_ids` and + concatenate to the output. + + Args: + logits: (num tokens) x (vocab) tensor + num_logprobs: minimum number of logprobs to + retain per token + token_ids: prompt tokens (if prompt logprobs) + or sampled tokens (if sampled + logprobs); 1D token ID tensor + with (num tokens) elements + + Returns: + Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1) + Top-k int indices tensor, (num tokens) x (num_logprobs + 1) + """ # Compute logprobs. logprobs = logits.log_softmax(dim=-1, dtype=torch.float32) topk_logprobs, topk_indices = torch.topk(logprobs, @@ -127,13 +139,12 @@ def get_logprobs( # Use int32 to reduce the tensor size. topk_indices = topk_indices.to(torch.int32) - # Concatenate with the sampled token_ids if provided. - if actual_token_ids is not None: - sampled_logprobs = logprobs[torch.arange(logprobs.size(0)), - actual_token_ids].unsqueeze(-1) - actual_token_ids = actual_token_ids.unsqueeze(-1) - topk_indices = torch.cat([actual_token_ids, topk_indices], dim=1) - topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs], dim=1) + # Concatenate with the token_ids + sampled_logprobs = logprobs[torch.arange(logprobs.size(0)), + token_ids].unsqueeze(-1) + token_ids = token_ids.unsqueeze(-1) + topk_indices = torch.cat([token_ids, topk_indices], dim=1) + topk_logprobs = torch.cat([sampled_logprobs, topk_logprobs], dim=1) return topk_logprobs.cpu(), topk_indices.cpu() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 053dc745f8b7f..f1b4d11db0465 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -675,8 +675,9 @@ def execute_model( prompt_logprobs_dict[request_id] = self.model.sampler.get_logprobs( logits, num_prompt_logprobs, - actual_token_ids=self.input_batch. 
- prompt_token_ids[prompt_indices]) + token_ids=self.input_batch.prompt_token_ids[ + prompt_indices] #type: ignore + ) sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over From 2505244ef69de21f4b0388c752b9c09c966b8f4f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 04:21:03 +0000 Subject: [PATCH 268/293] now prompt logprobs include prompt token Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 85 +++++++++++++++++++++++------- vllm/v1/worker/gpu_model_runner.py | 6 +-- 2 files changed, 70 insertions(+), 21 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 0725b1ac3742e..4f0408b0811e4 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -118,13 +118,20 @@ def _update_sample_logprobs( token_ids_lst: List[torch.Tensor], sample_logprobs_lst: List[torch.Tensor], ) -> Optional[SampleLogprobs]: - """ + """Incorporate sample logprobs from this step, if they exist. + Lists are only of length >1 if EngineCore made >1 tokens in prior step (e.g. in spec decoding). - Tensors are: - token_ids: [topk + 1]: topk token ids at pos - sample_logprobs: [topk + 1]: topk logprobs at pos + Args: + sampled_token_ids: list of int token ids + token_ids_list: list of (topk + 1) token ids tensors at each pos; + `None` if sample logprobs are disabled in this req + sample_logprobs: list of (topk + 1) logprobs tensors at each pos; + `None` if sample logprobs are disabled in this req + + Return: + Sample logprobs, if required for this request """ if self.num_logprobs == 0: # Sample logprobs disabled for this request @@ -177,6 +184,25 @@ def _update_prompt_logprobs( token_ids: Optional[torch.Tensor], prompt_logprobs: Optional[torch.Tensor], ) -> Optional[PromptLogprobs]: + """Incorporate prompt logprobs from this step, if they exist. + + If prompt logprobs are enabled for this request and EngineCore + prefilled the prompt or a chunk of the prompt in this step, + both arguments should be non-empty lists. + + If prompt logprobs are enabled but prefill is completed, both + arguments should be empty lists. + + If prompt logprobs are disabled, both arguments should be `None`. + + Args: + token_ids: (num tokens) x (topk + 1) token ids tensor + `None` if prompt logprobs are disabled in this req + prompt_logprobs: (num tokens) x (topk + 1) logprobs tensor + + Return: + Prompt logprobs, if required for this request + """ if self.num_prompt_logprobs == 0: # Prompt logprobs disabled for this request return None @@ -191,27 +217,50 @@ def _update_prompt_logprobs( # logprobs, in one or more chunks. assert self.prompt_logprobs is not None + if len(self.prompt_logprobs) == 0: + self.prompt_logprobs = [None] + # Detokenize non-incrementally. # NOTE(rob): the output is flattened: # [num_tok, num_lps] -> [num_tok * num_lps] decoded_tokens = detokenize_non_incrementally(self.tokenizer, token_ids) + # Make Logprob for each token. num_tokens, num_logprobs = prompt_logprobs.shape - chunk_prompt_logprobs = [ - self._make_pos_logprob_dict( - prompt_logprobs[tok_idx].tolist(), - token_ids[tok_idx].tolist(), - # Deal with the flattening from above. 
- decoded_tokens[tok_idx * num_logprobs:], - num_logprobs, - ) for tok_idx in range(num_tokens) - ] - # Buffer prefill chunk - self.prompt_logprobs = ( - ([None] # First logprob is None - if len(self.prompt_logprobs) == 0 else self.prompt_logprobs) + - chunk_prompt_logprobs) + for tok_idx in range(num_tokens): + + # Split into prompt token vs top_k. + prompt_token_id = token_ids[tok_idx, 0].item() + prompt_token_logprob = prompt_logprobs[tok_idx, 0].item() + topk_token_ids = token_ids[tok_idx, 1:] + topk_logprobs = prompt_logprobs[tok_idx, 1:] + + # Make the dict of top-token Logprob objects associated with the + # current prompt offset + if prompt_token_id in topk_token_ids: + self.prompt_logprobs.append( + self._make_pos_logprob_dict( + topk_logprobs.tolist(), + topk_token_ids.tolist(), + # Deal with the flattening from above. + decoded_tokens[tok_idx * num_logprobs:], + self.num_prompt_logprobs, + )) + else: + # If the prompt token is not one of the top tokens + # at this prompt offset, inject the prompt token + # & its Logprob instance into the dict + prompt_logprob_obj = Logprob( + logprob=prompt_token_logprob, + decoded_token=self.tokenizer.decode(prompt_token_id)) + self.prompt_logprobs.append( + self._make_pos_logprob_dict( + topk_logprobs.tolist(), + topk_token_ids.tolist(), + decoded_tokens[tok_idx * num_logprobs:], + self.num_prompt_logprobs, + (prompt_token_id, prompt_logprob_obj))) return self.prompt_logprobs @staticmethod diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f1b4d11db0465..3158ceca32f74 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -661,7 +661,7 @@ def execute_model( prompt_logprobs_dict: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {} if len(self.input_batch.num_prompt_logprobs) > 0: # Prompt token ids are required for computing prompt logprobs - assert self.input_batch.prompt_token_ids is not None + assert input_ids is not None for (request_id, num_prompt_logprobs ) in self.input_batch.num_prompt_logprobs.items(): @@ -670,13 +670,13 @@ def execute_model( request_id, scheduler_output, req_indices) prompt_hidden_states = hidden_states[prompt_indices] logits = self.model.compute_logits(prompt_hidden_states, None) + chunk_prompt_token_ids = input_ids[prompt_indices] # Compute prompt logprobs. 
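# Shape sketch for the get_logprobs() call just below, using made-up sizes:
# a 4-token prompt chunk, a toy vocab of 10, and num_prompt_logprobs=2.
# Column 0 of both returned tensors holds the token id passed in for each row
# (and its logprob); the remaining columns hold the top-k entries, so the
# result is (num tokens) x (num_prompt_logprobs + 1).
import torch

logits = torch.randn(4, 10)                   # one row per prompt position
given_token_ids = torch.randint(0, 10, (4,))  # token id scored at each row
logprobs = logits.log_softmax(dim=-1, dtype=torch.float32)
topk_logprobs, topk_ids = logprobs.topk(2, dim=-1)
given_logprobs = logprobs[torch.arange(4), given_token_ids].unsqueeze(-1)
out_logprobs = torch.cat([given_logprobs, topk_logprobs], dim=1)
out_token_ids = torch.cat([given_token_ids.unsqueeze(-1), topk_ids], dim=1)
assert out_logprobs.shape == (4, 3) and out_token_ids.shape == (4, 3)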
prompt_logprobs_dict[request_id] = self.model.sampler.get_logprobs( logits, num_prompt_logprobs, - token_ids=self.input_batch.prompt_token_ids[ - prompt_indices] #type: ignore + token_ids=chunk_prompt_token_ids #type: ignore ) sampled_token_ids = sampler_output.sampled_token_ids From e1058acd63cb623ea1bb54cba168f8107e5511be Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 05:24:30 +0000 Subject: [PATCH 269/293] wip making prompt logprobs line up with tok ids Signed-off-by: Andrew Feldman --- tests/v1/sample/test_logprobs.py | 14 +++++++++++--- vllm/v1/engine/detokenizer.py | 3 +-- vllm/v1/worker/gpu_model_runner.py | 4 +++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 275f6b8335f4a..35087b585b40f 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -100,12 +100,16 @@ def _test_case_get_logprobs_and_prompt_logprobs( # correct assert vllm_result.outputs[0].logprobs is not None assert len(vllm_result.outputs[0].logprobs) == max_tokens - for logprobs in vllm_result.outputs[0].logprobs: + for logprobs, token_id in zip(vllm_result.outputs[0].logprobs, + vllm_result.outputs[0].token_ids): assert logprobs is not None # If the output token is not included in the top X # logprob, it can return 1 more data assert (len(logprobs) == num_top_logprobs or len(logprobs) == num_top_logprobs + 1) + # But confirm that the output token ultimately does appear + # among the logprobs + assert token_id in logprobs output_text = vllm_result.outputs[0].text output_string_from_most_likely_tokens_lst: List[str] = [] for top_logprobs in vllm_result.outputs[0].logprobs: @@ -165,13 +169,17 @@ def _test_case_get_logprobs_and_prompt_logprobs( # the prompt assert len(vllm_result.prompt_logprobs) == len( vllm_result.prompt_token_ids) - for prompt_logprobs in vllm_result.prompt_logprobs[1:]: + for prompt_logprobs, prompt_token_id in zip( + vllm_result.prompt_logprobs[1:], + vllm_result.prompt_token_ids[1:]): assert prompt_logprobs is not None # - If the prompt token is not included in the top X # logprob, it can return 1 more data assert (len(prompt_logprobs) == num_top_prompt_logprobs or len(prompt_logprobs) == num_top_prompt_logprobs + 1) - + # But confirm that the prompt token ultimately does appear + # among the prompt logprobs + assert prompt_token_id in prompt_logprobs # Compare prompt logprobs to HF # The first prompt logprob is always None, so we compare it from # 1:. 
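# A minimal sketch of the invariant checked above, with made-up numbers:
# for a request with prompt_logprobs=k, entry 0 of prompt_logprobs is None
# and every later entry is a dict of the top-k token ids at that position,
# plus one extra entry (rank k + 1) whenever the actual prompt token did not
# make the top k.
from vllm.sequence import Logprob

k = 2
pos_logprobs = {
    17: Logprob(logprob=-0.1, rank=1),
    42: Logprob(logprob=-1.3, rank=2),
}
prompt_token_id = 99  # not among the top-k at this position
pos_logprobs[prompt_token_id] = Logprob(logprob=-4.2, rank=k + 1)
assert len(pos_logprobs) in (k, k + 1)
assert prompt_token_id in pos_logprobs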
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 4f0408b0811e4..434a16906c04a 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -256,8 +256,7 @@ def _update_prompt_logprobs( decoded_token=self.tokenizer.decode(prompt_token_id)) self.prompt_logprobs.append( self._make_pos_logprob_dict( - topk_logprobs.tolist(), - topk_token_ids.tolist(), + topk_logprobs.tolist(), topk_token_ids.tolist(), decoded_tokens[tok_idx * num_logprobs:], self.num_prompt_logprobs, (prompt_token_id, prompt_logprob_obj))) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3158ceca32f74..429241ac3b328 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -670,7 +670,9 @@ def execute_model( request_id, scheduler_output, req_indices) prompt_hidden_states = hidden_states[prompt_indices] logits = self.model.compute_logits(prompt_hidden_states, None) - chunk_prompt_token_ids = input_ids[prompt_indices] + # - Offset `prompt_indices` by 1 because (in general) the logprob + # distribution at sequence position i is predicting position i+1 + chunk_prompt_token_ids = input_ids[prompt_indices + 1] # Compute prompt logprobs. prompt_logprobs_dict[request_id] = self.model.sampler.get_logprobs( From 5f33902c831ffba4584dab605b0743bbd9a4d8da Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 07:16:39 +0000 Subject: [PATCH 270/293] partial req peek token Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_input_batch.py | 4 +++ vllm/v1/worker/gpu_model_runner.py | 43 ++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index c941fa78db985..69195d7680109 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -165,6 +165,10 @@ def __init__( # NOTE(rob): num_prompt_logprobs ONLY includes reqs # that are currently in the prefill phase. self.num_prompt_logprobs: Dict[str, int] = {} + # Dict mapping from partial request ID, to the ID of the token which + # immediately follows the last token processed in the current step. + # Only necessary for partial requests with prompt logprobs enabled. + self.partial_req_peek_token_ids: Dict[str, int] = {} def add_request( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 429241ac3b328..c88865413d43d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -270,6 +270,16 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) + def _compute_partial_req_next_token( + self, + req_id: str, + num_scheduled_tokens: npt.NDArray, + ) -> int: + req_idx = self.input_batch.req_id_to_index[req_id] + tok_idx = self.input_batch.num_computed_tokens_cpu[req_idx] + int( + num_scheduled_tokens[req_idx]) + return int(self.input_batch.token_ids_cpu[req_idx, tok_idx]) + def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -309,6 +319,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): arange, out=positions_np) + # ONLY for partial requests which need logprobs - prefetch the ID of the + # token immediately following the last token processed in this step. 
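# Tiny numeric sketch of the peek logic set up just below, reusing the toy
# prompt from the peek helper's docstring in a following commit: logits at
# prompt position i score the token at position i+1, so the last position of
# a partially-prefilled chunk needs the first token of the *next* chunk as
# its target.
prompt_token_ids = [0, 5, 2, 3, 8, 5, 6, 7]  # full prompt for one request
num_computed_tokens = 0                      # nothing prefilled yet
num_scheduled_tokens = 4                     # this step prefills [0, 5, 2, 3]
peek_idx = num_computed_tokens + num_scheduled_tokens
peek_token_id = prompt_token_ids[peek_idx]   # == 8, target for position 3
targets = prompt_token_ids[1:peek_idx] + [peek_token_id]
assert targets == [5, 2, 3, 8]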
+ self.input_batch.partial_req_peek_token_ids = { + req_id: + self._compute_partial_req_next_token(req_id, num_scheduled_tokens) + for req_id in set(scheduler_output.partial_req_ids) + & set(self.input_batch.num_prompt_logprobs.keys()) + } + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -491,7 +510,7 @@ def _prepare_sampling( def _prepare_prompt_indices( self, req_id: str, - scheduler_output: "SchedulerOutput", + is_partial_req: bool, req_indices: npt.NDArray, ) -> npt.NDArray: """Get the indices of a prompt in the batch.""" @@ -504,7 +523,7 @@ def _prepare_prompt_indices( indices = self.arange_np[:req_indices.shape[0]] prompt_indices = indices[req_indices == req_idx] # Remove the sample token if there is one. - if req_id not in scheduler_output.partial_req_ids: + if not is_partial_req: prompt_indices = prompt_indices[:-1] return prompt_indices @@ -665,14 +684,28 @@ def execute_model( for (request_id, num_prompt_logprobs ) in self.input_batch.num_prompt_logprobs.items(): - # Compute of the prompt. + # Compute the positions of the prompt tokens + is_partial_req = request_id in scheduler_output.partial_req_ids prompt_indices = self._prepare_prompt_indices( - request_id, scheduler_output, req_indices) + request_id, is_partial_req, req_indices) prompt_hidden_states = hidden_states[prompt_indices] logits = self.model.compute_logits(prompt_hidden_states, None) # - Offset `prompt_indices` by 1 because (in general) the logprob # distribution at sequence position i is predicting position i+1 - chunk_prompt_token_ids = input_ids[prompt_indices + 1] + if is_partial_req: + # - The prompt logprobs at the final position in this chunk, + # are predicting the probability distribution of the first + # token id in the next chunk - thus we must peek ahead at + # the next chunk in order to know which token's prompt + # logprobs to hold on to. + peek_token_id = torch.tensor( + [self.input_batch.partial_req_peek_token_ids[request_id]], + dtype=torch.int, + device=input_ids.device) + chunk_prompt_token_ids = torch.cat( + (input_ids[prompt_indices[:-1] + 1], peek_token_id)) + else: + chunk_prompt_token_ids = input_ids[prompt_indices + 1] # Compute prompt logprobs. prompt_logprobs_dict[request_id] = self.model.sampler.get_logprobs( From 199a834b6a0912f6e1ea6cbc193b926f1d472dd7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 15:11:51 +0000 Subject: [PATCH 271/293] refactoring Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_input_batch.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 46 +++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 69195d7680109..b5c3690b492e8 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -168,7 +168,7 @@ def __init__( # Dict mapping from partial request ID, to the ID of the token which # immediately follows the last token processed in the current step. # Only necessary for partial requests with prompt logprobs enabled. 
- self.partial_req_peek_token_ids: Dict[str, int] = {} + self.cached_partial_req_peek_token_ids: Dict[str, torch.Tensor] = {} def add_request( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c88865413d43d..770f3cae84063 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -270,15 +270,34 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - def _compute_partial_req_next_token( + def _peek_next_chunk_first_token( self, req_id: str, num_scheduled_tokens: npt.NDArray, - ) -> int: + ) -> torch.Tensor: + """During chunked prefill, peek at ID of next chunk's first token. + + Example: + + * Suppose prompt logprobs are enabled for request with id '6' + * Suppose prompt_token_ids = [0,5,2,3,8,5,6,7] for request id '6' + * Suppose in this step, the chunk [0,5,2,3] is being prefilled + * This method will return the token ID 8 + + Args: + req_id: request ID + num_scheduled_tokens: np array of per-req scheduled token counts + + Returns: + Single-element 1D GPU tensor containing ID of first token in next + chunk. + """ req_idx = self.input_batch.req_id_to_index[req_id] tok_idx = self.input_batch.num_computed_tokens_cpu[req_idx] + int( num_scheduled_tokens[req_idx]) - return int(self.input_batch.token_ids_cpu[req_idx, tok_idx]) + return torch.tensor([self.input_batch.token_ids_cpu[req_idx, tok_idx]], + dtype=torch.int, + device=self.device) def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -319,11 +338,12 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): arange, out=positions_np) - # ONLY for partial requests which need logprobs - prefetch the ID of the - # token immediately following the last token processed in this step. - self.input_batch.partial_req_peek_token_ids = { - req_id: - self._compute_partial_req_next_token(req_id, num_scheduled_tokens) + # ONLY for partial requests with prompt logprobs enabled - peek at the + # ID of the prompt token immediately following the chunk processed in + # this step. Cache the token ID. + self.input_batch.cached_partial_req_peek_token_ids = { + req_id: self._peek_next_chunk_first_token(req_id, + num_scheduled_tokens) for req_id in set(scheduler_output.partial_req_ids) & set(self.input_batch.num_prompt_logprobs.keys()) } @@ -696,12 +716,10 @@ def execute_model( # - The prompt logprobs at the final position in this chunk, # are predicting the probability distribution of the first # token id in the next chunk - thus we must peek ahead at - # the next chunk in order to know which token's prompt - # logprobs to hold on to. - peek_token_id = torch.tensor( - [self.input_batch.partial_req_peek_token_ids[request_id]], - dtype=torch.int, - device=input_ids.device) + # the first token in the next chunk in order to know which + # token's prompt logprobs to hold on to. + peek_token_id = (self.input_batch. 
+ cached_partial_req_peek_token_ids[request_id]) chunk_prompt_token_ids = torch.cat( (input_ids[prompt_indices[:-1] + 1], peek_token_id)) else: From 879fc44795f9af065581edbb2a9ed68c5018a3b1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 15:18:47 +0000 Subject: [PATCH 272/293] refactoring; non-blocking cpu->gpu transfer Signed-off-by: Andrew Feldman --- vllm/v1/worker/gpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 770f3cae84063..12265e40f46b8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -297,7 +297,8 @@ def _peek_next_chunk_first_token( num_scheduled_tokens[req_idx]) return torch.tensor([self.input_batch.token_ids_cpu[req_idx, tok_idx]], dtype=torch.int, - device=self.device) + device='cpu').to(device=self.device, + non_blocking=True) def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens From 0f425fe2b99a2ce7fd22dc0a2fde4e1bfe61f23a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 17:19:15 +0000 Subject: [PATCH 273/293] wip detokenizer tests Signed-off-by: Andrew Feldman --- tests/v1/engine/test_detokenizer.py | 156 ++++++++++++++++++---------- tests/v1/engine/utils.py | 30 +++--- 2 files changed, 116 insertions(+), 70 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 1065fed39c7f1..71251442382db 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -1,8 +1,10 @@ -from typing import List, Optional, Tuple +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union import pytest import torch -from transformers import AutoTokenizer +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, generate_dummy_sample_logprobs, @@ -17,52 +19,83 @@ NUM_PROMPT_LOGPROBS = 7 # Use Mistral instruct tokenizer TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) FULL_STRINGS = [ "My name is Robert from Neural Magic and I love working on vLLM so much!", "Red Hat is the best open source company by far across Linux, K8s, and AI.", "Nick is the name of my brother in addition to my colleague from Red Hat.", ] - STOP_STRINGS = ["I love working on", "company by far", "brother in"] - -FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] PROMPT_LEN = 5 -# Tokenize prompts under test & create dummy generated tokens -PROMPT_TOKENS = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS -] -GENERATION_TOKENS = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS -] - -# Generate dummy prompt logprobs & sample logprobs for initializing -# the mock engine -PROMPT_LOGPROBS: List[Tuple[torch.Tensor, torch.Tensor]] = [ - generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, - num_logprobs=NUM_PROMPT_LOGPROBS, - tokenizer=tokenizer) - for tokens_list in PROMPT_TOKENS -] -GENERATION_LOGPROBS = [ - generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, - num_logprobs=NUM_SAMPLE_LOGPROBS, - tokenizer=tokenizer) - for tokens_list in GENERATION_TOKENS -] -PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, - skip_special_tokens=True, - tokenizer=tokenizer) for prompt_tokens in PROMPT_TOKENS -] -PROMPT_STRINGS_LEN 
= [len(prompt_string) for prompt_string in PROMPT_STRINGS] -GENERATION_STRINGS = [ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) -] +@dataclass +class DummyTestVectors: + """Dummy test vectors for detokenizer tests""" + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + full_tokens: List[List[int]] # Prompt + generated tokens + prompt_tokens: List[List[int]] + generation_tokens: List[List[int]] + # Each request is associated with a tuple of (top logprobs,top tokens) + # prompt logprobs tensors + prompt_logprobs: List[Tuple[torch.Tensor, torch.Tensor]] + # Each request is associated with a sample logprobs; a request's + # sample logprobs are a list of (top logprobs,top tokens) + # sample logprobs tensors at each sequence position + generation_logprobs: List[List[Tuple[torch.Tensor, torch.Tensor]]] + prompt_strings: List[str] + prompt_strings_len: List[int] + generation_strings: List[str] + + +@pytest.fixture(scope="module") +def dummy_test_vectors() -> DummyTestVectors: + """Generate dummy test vectors for detokenizer tests. + + Returns: + DummyTestVectors instance + """ + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + # Tokenize prompts under test & create dummy generated tokens + prompt_tokens = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS + ] + generation_tokens = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS + ] + # Generate prompt strings + prompt_strings = [ + tokenizer.decode(prompt_tokens, + skip_special_tokens=True, + tokenizer=tokenizer) + for prompt_tokens in prompt_tokens + ] + prompt_strings_len = [ + len(prompt_string) for prompt_string in prompt_strings + ] + return DummyTestVectors( + tokenizer=tokenizer, + full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], + prompt_tokens=prompt_tokens, + generation_tokens=generation_tokens, + prompt_strings=prompt_strings, + prompt_strings_len=prompt_strings_len, + generation_strings=[ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) + ], + prompt_logprobs=[ + generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS, + tokenizer=tokenizer) + for tokens_list in prompt_tokens + ], + generation_logprobs=[ + generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS, + tokenizer=tokenizer) + for tokens_list in generation_tokens + ]) class MockEngineCore: @@ -143,16 +176,22 @@ def test_incremental_detokenization( request_output_kind: RequestOutputKind, logprobs: Optional[int], prompt_logprobs: Optional[int], + dummy_test_vectors: DummyTestVectors, ) -> None: + generation_tokens = dummy_test_vectors.generation_tokens + prompt_tokens = dummy_test_vectors.prompt_tokens + # Determine whether sample/prompt logprobs are enabled do_generated_logprobs = logprobs is not None do_prompt_logprobs = prompt_logprobs is not None detokenizer = Detokenizer(TOKENIZER_NAME) + # Build mock engine core, which emulates sampling & logprobs engine_core = MockEngineCore( - generated_tokens_list=GENERATION_TOKENS, - prompt_tokens_list=PROMPT_TOKENS, - generated_logprobs_raw=GENERATION_LOGPROBS + generated_tokens_list=generation_tokens, + prompt_tokens_list=prompt_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs if do_generated_logprobs else None, - prompt_logprobs_raw=PROMPT_LOGPROBS if do_prompt_logprobs else None) + prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs + if do_prompt_logprobs else None) # 
Make N requests. requests = [ @@ -173,9 +212,8 @@ def test_incremental_detokenization( include_stop_str_in_output=False, logprobs=logprobs, prompt_logprobs=prompt_logprobs)) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, prompt_tokens)) ] # Add requests to the detokenizer. @@ -195,7 +233,8 @@ def test_incremental_detokenization( assert len(requests_to_abort) == 0 # Validate logprob detokenization - validate_requests_logprobs(requests, request_outputs, tokenizer) + validate_requests_logprobs(requests, request_outputs, + dummy_test_vectors.tokenizer) # Update tracking. for request_output in request_outputs: @@ -211,7 +250,7 @@ def test_incremental_detokenization( # Confirmed tracked values matches what we expected. for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(GENERATION_STRINGS, GENERATION_TOKENS)): + zip(dummy_test_vectors.generation_strings, generation_tokens)): gen_str = gen_strings[f"request-{idx}"] gen_toks = gen_tokens[f"request-{idx}"] @@ -231,16 +270,19 @@ def test_stop_string( include_stop_str_in_output: bool, logprobs: Optional[int], prompt_logprobs: Optional[int], + dummy_test_vectors: DummyTestVectors, ) -> None: + prompt_tokens = dummy_test_vectors.prompt_tokens do_generated_logprobs = logprobs is not None do_prompt_logprobs = prompt_logprobs is not None detokenizer = Detokenizer(TOKENIZER_NAME) engine_core = MockEngineCore( - generated_tokens_list=GENERATION_TOKENS, - prompt_tokens_list=PROMPT_TOKENS, - generated_logprobs_raw=GENERATION_LOGPROBS + generated_tokens_list=dummy_test_vectors.generation_tokens, + prompt_tokens_list=prompt_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs if do_generated_logprobs else None, - prompt_logprobs_raw=PROMPT_LOGPROBS if do_prompt_logprobs else None) + prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs + if do_prompt_logprobs else None) # Make N requests. requests = [ @@ -262,9 +304,8 @@ def test_stop_string( include_stop_str_in_output=include_stop_str_in_output, logprobs=logprobs, prompt_logprobs=prompt_logprobs, - )) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + )) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, prompt_tokens)) ] # Add requests to the detokenizer. @@ -288,7 +329,8 @@ def test_stop_string( aborted.extend(requests_to_abort) # Validate logprob detokenization - validate_requests_logprobs(requests, request_outputs, tokenizer) + validate_requests_logprobs(requests, request_outputs, + dummy_test_vectors.tokenizer) # Update tracking. for request_output in request_outputs: @@ -304,8 +346,8 @@ def test_stop_string( i += 1 # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, - stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + for idx, (ref_gen_str, stop_str) in enumerate( + zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): # Request should be aborted. 
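# Minimal sketch of the stop-string behaviour exercised in this test, with a
# made-up generation: output text is cut at the first stop-string match, and
# the match itself is kept only when include_stop_str_in_output=True.
generated = "Robert from Neural Magic and I love working on vLLM so much!"
stop = "I love working on"
idx = generated.find(stop)
without_stop = generated[:idx]           # include_stop_str_in_output=False
with_stop = generated[:idx + len(stop)]  # include_stop_str_in_output=True
assert not without_stop.endswith(stop)
assert with_stop.endswith(stop)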
request_id = f"request-{idx}" diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 1a2d98d7fe64c..5cf0eb9daa9f0 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -32,7 +32,7 @@ def _create_random_top_logprob_test_vector( Returns: 1D length-`num_logprobs` torch Tensor of float logprob values """ - return torch.rand(num_logprobs + 1) * (upper - lower) + lower + return torch.rand(num_logprobs) * (upper - lower) + lower def _create_random_top_logprob_test_matrix( @@ -61,8 +61,11 @@ def _create_random_top_logprob_test_matrix( def _create_random_top_token_test_vector( - num_logprobs: int, lower: int, upper: int, sampled_token_id: int, - adjust_num_logprobs: bool) -> torch.Tensor: + num_logprobs: int, + lower: int, + upper: int, + sampled_token_id: int, +) -> torch.Tensor: """Create a random vector of top logprob token indices Use to create fake sample logprobs for testing. The sampled token @@ -104,6 +107,7 @@ def _create_random_top_token_test_matrix( shape: Tuple[int, int], lower: int, upper: int, + tokens_list: List[int], ) -> torch.Tensor: """Create a random matrix of top logprob token indices @@ -123,7 +127,9 @@ def _create_random_top_token_test_matrix( """ num_elements = shape[0] * shape[1] choice_tensor = torch.randperm(upper - lower)[:num_elements] + lower - return choice_tensor.view(shape) + return torch.cat((torch.tensor(tokens_list, dtype=torch.int).unsqueeze(-1), + choice_tensor.view(shape)), + dim=1) def generate_dummy_sample_logprobs( @@ -148,13 +154,11 @@ def generate_dummy_sample_logprobs( """ res = [] for sampled_token_id in sampled_tokens_list: - num_logprobs_adjustment = random.choice([0, 1]) - res.append((_create_random_top_logprob_test_vector( - num_logprobs + num_logprobs_adjustment, -100, 0), - _create_random_top_token_test_vector( - num_logprobs, 0, - len(tokenizer.vocab) - 1, sampled_token_id, - num_logprobs_adjustment > 0))) + res.append( + (_create_random_top_logprob_test_vector(num_logprobs + 1, -100, 0), + _create_random_top_token_test_vector(num_logprobs, 0, + len(tokenizer.vocab) - 1, + sampled_token_id))) return res @@ -181,10 +185,10 @@ def generate_dummy_prompt_logprobs( """ num_prompt_tokens = len(prompt_tokens_list) return (_create_random_top_logprob_test_matrix( - (num_prompt_tokens, num_logprobs), -100, 0), + (num_prompt_tokens, num_logprobs + 1), -100, 0), _create_random_top_token_test_matrix( (num_prompt_tokens, num_logprobs), 0, - len(tokenizer.vocab) - 1)) + len(tokenizer.vocab) - 1, prompt_tokens_list)) def _decode_token( From 108912719b4e3dee3580e23062816686a0838802 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 18:33:58 +0000 Subject: [PATCH 274/293] detok test fix Signed-off-by: Andrew Feldman --- tests/v1/engine/utils.py | 16 ++++++-- vllm/v1/engine/detokenizer.py | 72 ++++++++++++++++++++--------------- 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 5cf0eb9daa9f0..ff2ebe77f0911 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -183,12 +183,20 @@ def generate_dummy_prompt_logprobs( where both matrices have dimensions num_prompt_tokens x num_logprobs """ - num_prompt_tokens = len(prompt_tokens_list) + # For now, assume the whole prompt is processed in one chunk; thus, + # the number of non-`None` prompt logprobs is `len(prompt_tokens_list)-1`. 
+ # Prior to injecting `None` at the beginning of prompt logprobs (which + # happens later in the detokenizer, not here), the prompt logprobs in + # the ith position are predicting the probability distribution of the + # prompt token in (i+1)st position. Thus, we concat + # `prompt_tokens_list[1:]` to the dummy token ids, just as the engine + # would. + num_prompt_logprobs = len(prompt_tokens_list) - 1 return (_create_random_top_logprob_test_matrix( - (num_prompt_tokens, num_logprobs + 1), -100, 0), + (num_prompt_logprobs, num_logprobs + 1), -100, 0), _create_random_top_token_test_matrix( - (num_prompt_tokens, num_logprobs), 0, - len(tokenizer.vocab) - 1, prompt_tokens_list)) + (num_prompt_logprobs, num_logprobs), 0, + len(tokenizer.vocab) - 1, prompt_tokens_list[1:])) def _decode_token( diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 434a16906c04a..8a1f45508a837 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -183,6 +183,7 @@ def _update_prompt_logprobs( self, token_ids: Optional[torch.Tensor], prompt_logprobs: Optional[torch.Tensor], + prompt_token_ids_lst: List[int], ) -> Optional[PromptLogprobs]: """Incorporate prompt logprobs from this step, if they exist. @@ -196,9 +197,11 @@ def _update_prompt_logprobs( If prompt logprobs are disabled, both arguments should be `None`. Args: - token_ids: (num tokens) x (topk + 1) token ids tensor + token_ids: (num prompt tokens-1) x (topk + 1) token ids tensor `None` if prompt logprobs are disabled in this req - prompt_logprobs: (num tokens) x (topk + 1) logprobs tensor + prompt_logprobs: (num prompt tokens-1) x (topk + 1) logprobs tensor + prompt_token_ids_lst: (num prompt tokens)-length list of prompt + token ids Return: Prompt logprobs, if required for this request @@ -227,11 +230,17 @@ def _update_prompt_logprobs( token_ids) # Make Logprob for each token. - num_tokens, num_logprobs = prompt_logprobs.shape - for tok_idx in range(num_tokens): - + num_chunk_tokens, decoded_tokens_stride = prompt_logprobs.shape + prompt_idx = len(self.prompt_logprobs) + for tok_idx, prompt_token_id in zip(range(num_chunk_tokens), + prompt_token_ids_lst[prompt_idx:]): + # Iterate over prefill chunk + assert prompt_token_id + assert prompt_token_id == token_ids[tok_idx, 0].item(), ( + "Sampler concats the prompt token logprob in front of " + f"the topk logprobs, but got {prompt_token_id=} and " + f"{token_ids[tok_idx, 0].item()=}") # Split into prompt token vs top_k. - prompt_token_id = token_ids[tok_idx, 0].item() prompt_token_logprob = prompt_logprobs[tok_idx, 0].item() topk_token_ids = token_ids[tok_idx, 1:] topk_logprobs = prompt_logprobs[tok_idx, 1:] @@ -244,7 +253,7 @@ def _update_prompt_logprobs( topk_logprobs.tolist(), topk_token_ids.tolist(), # Deal with the flattening from above. 
- decoded_tokens[tok_idx * num_logprobs:], + decoded_tokens[tok_idx * decoded_tokens_stride:], self.num_prompt_logprobs, )) else: @@ -257,7 +266,7 @@ def _update_prompt_logprobs( self.prompt_logprobs.append( self._make_pos_logprob_dict( topk_logprobs.tolist(), topk_token_ids.tolist(), - decoded_tokens[tok_idx * num_logprobs:], + decoded_tokens[tok_idx * decoded_tokens_stride:], self.num_prompt_logprobs, (prompt_token_id, prompt_logprob_obj))) return self.prompt_logprobs @@ -265,32 +274,34 @@ def _update_prompt_logprobs( @staticmethod def _make_pos_logprob_dict( logprobs: List[float], - token_ids: List[int], + logprob_token_ids: List[int], decoded_tokens: List[str], num_logprobs: int, - sampled_token_id_logprob: Optional[Tuple[int, Logprob]] = None, + special_token_id_logprob: Optional[Tuple[int, Logprob]] = None, ) -> Dict[int, Logprob]: """Make a Logprob dictionary for a position in the sequence. Returns a dictionary mapping top token ids to Logprob data structures. Each Logprob data structure includes log probability, - decoded token, and rank (1-indexed). The size of the dict returned + decoded token, and rank (index+1). The size of the dict returned will be be num_logprobs. - If the sampled token is not among the top logprobs, then - sampled_token_id_logprob = (sampled_token_id,sample_logprob) must be - provided; an additional dictionary entry mapping sampled_token_id -> - sample_logprob will be injected with rank equal to num_logprobs + 1 - (sampled_token_id must be lowest-rank if we are having to inject it.) + If the special token (sampled token or prompt token associated + with the current sequence position) is not among the top logprobs, + then special_token_id_logprob = (special_token_id,logprob) must be + provided; an additional dictionary entry mapping special_token_id -> + logprob will be injected with rank equal to num_logprobs + 1 + (special_token_id must be lowest-rank if we are having to inject it.) Note that the size of the dict returned will then be num_logprobs + 1. Args: logprobs: list of log probabilities - token_ids: list of top token ids + logprob_token_ids: list of top token ids decoded_tokens: list of decoded top tokens num_logprobs: number of top tokens - sampled_token_id_logprob: (optional) tuple of - (sampled_token_id,sample_logprob) + special_token_id_logprob: (optional) tuple of + (special_token_id,logprob) associated with + sampled token or prompt token Returns: Dict[top token id, Logprob]; num_logprobs or num_logprobs+1 @@ -300,7 +311,7 @@ def _make_pos_logprob_dict( # Sampler uses torch.topk() which sorts so the # index in lists is equivalent to rank-1. 
logprobs_dict = { - token_ids[idx]: Logprob( + logprob_token_ids[idx]: Logprob( logprob=logprobs[idx], rank=idx + 1, decoded_token=decoded_tokens[idx], @@ -308,14 +319,14 @@ def _make_pos_logprob_dict( for idx in range(num_logprobs) } - # Inject sampled token Logprob if necessary - if sampled_token_id_logprob: - sampled_token_id = sampled_token_id_logprob[0] - sample_logprob_obj = sampled_token_id_logprob[1] - assert sampled_token_id is not None - assert sample_logprob_obj is not None - sample_logprob_obj.rank = num_logprobs + 1 - logprobs_dict[sampled_token_id] = sample_logprob_obj + # Inject special token Logprob if necessary + if special_token_id_logprob: + special_token_id = special_token_id_logprob[0] + special_logprob_obj = special_token_id_logprob[1] + assert special_token_id is not None + assert special_logprob_obj is not None + special_logprob_obj.rank = num_logprobs + 1 + logprobs_dict[special_token_id] = special_logprob_obj return logprobs_dict @@ -387,9 +398,8 @@ def add_tokens( # 4) Make Prompt Logprobs. prompt_logprobs = self._update_prompt_logprobs( - new_prompt_logprobs_token_ids, - new_prompt_logprobs, - ) + new_prompt_logprobs_token_ids, new_prompt_logprobs, + self.prompt_token_ids) # 5) Makes the RequestOutput object with the new text. finished = bool(finish_reason) From d2742d8721459629a99bb7b0af7505a7cbe00f1a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 9 Jan 2025 18:42:58 +0000 Subject: [PATCH 275/293] passing detok tests Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8a1f45508a837..383b5868505ad 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -244,6 +244,7 @@ def _update_prompt_logprobs( prompt_token_logprob = prompt_logprobs[tok_idx, 0].item() topk_token_ids = token_ids[tok_idx, 1:] topk_logprobs = prompt_logprobs[tok_idx, 1:] + decoded_tokens_offset = tok_idx * decoded_tokens_stride + 1 # Make the dict of top-token Logprob objects associated with the # current prompt offset @@ -253,7 +254,7 @@ def _update_prompt_logprobs( topk_logprobs.tolist(), topk_token_ids.tolist(), # Deal with the flattening from above. 
- decoded_tokens[tok_idx * decoded_tokens_stride:], + decoded_tokens[decoded_tokens_offset:], self.num_prompt_logprobs, )) else: @@ -266,7 +267,7 @@ def _update_prompt_logprobs( self.prompt_logprobs.append( self._make_pos_logprob_dict( topk_logprobs.tolist(), topk_token_ids.tolist(), - decoded_tokens[tok_idx * decoded_tokens_stride:], + decoded_tokens[decoded_tokens_offset:], self.num_prompt_logprobs, (prompt_token_id, prompt_logprob_obj))) return self.prompt_logprobs From a55e679b505f09c1c97c961cd7dbc6ee49e7c2a7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 10 Jan 2025 01:48:48 +0000 Subject: [PATCH 276/293] LLMEngine test working, wip AsyncLLM test Signed-off-by: Andrew Feldman --- .../offline_inference/offline_inference.py | 8 +++--- tests/v1/engine/test_async_llm.py | 13 ++++++++++ tests/v1/engine/test_llm_engine.py | 14 ++++++++++ vllm/v1/engine/async_llm.py | 24 ++++++++++++++++- vllm/v1/engine/llm_engine.py | 26 +++++++++++++++++-- vllm/v1/engine/utils.py | 14 ++++++++++ 6 files changed, 93 insertions(+), 6 deletions(-) create mode 100644 tests/v1/engine/test_llm_engine.py create mode 100644 vllm/v1/engine/utils.py diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/offline_inference.py index 23cc6e8539431..7435d5b00a377 100644 --- a/examples/offline_inference/offline_inference.py +++ b/examples/offline_inference/offline_inference.py @@ -8,10 +8,12 @@ "The future of AI is", ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + prompt_logprobs=5) # Create an LLM. -llm = LLM(model="facebook/opt-125m") +llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) @@ -19,4 +21,4 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index fffb5b8100ec7..221c7b23f53c5 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -7,6 +7,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.platforms import current_platform from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.utils import STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", @@ -30,6 +31,18 @@ async def generate(engine: AsyncLLM, request_id: str, return count, request_id +def test_async_llm_refuses_prompt_logprobs_with_apc(): + """Test passes if AsyncLLM raises an exception when it is configured + for automatic prefix caching and it receives a request with + prompt_logprobs enabled, which is incompatible.""" + with pytest.raises(ValueError) as excinfo: + (LLM(model="facebook/opt-125m", enable_prefix_caching=True).generate( + "Hello, my name is", + SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5))) + # Validate exception string is correct + assert str(excinfo.value) == STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED + + @pytest.mark.asyncio async def test_load(monkeypatch): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py new file mode 100644 index 0000000000000..3efe864bdf30c --- /dev/null +++ b/tests/v1/engine/test_llm_engine.py @@ -0,0 +1,14 @@ +"""LLMEngine tests""" +import pytest +from vllm import LLM, SamplingParams +from vllm.v1.engine.utils import STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED + +def test_llm_engine_refuses_prompt_logprobs_with_apc(): + """Test passes if LLMEngine raises an exception when it is configured + for automatic prefix caching and it receives a request with + prompt_logprobs enabled, which is incompatible.""" + with pytest.raises(ValueError) as excinfo: + (LLM(model="facebook/opt-125m",enable_prefix_caching=True) + .generate("Hello, my name is", SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5))) + # Validate exception string is correct + assert str(excinfo.value) == STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED \ No newline at end of file diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b963ba74f13f0..090e90fbfbc78 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ import os from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union -from vllm.config import ModelConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient @@ -22,6 +22,7 @@ from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.engine.utils import STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED logger = init_logger(__name__) @@ -47,6 +48,7 @@ def __init__( self.log_stats = log_stats self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + 
self.cache_config = vllm_config.cache_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -162,6 +164,22 @@ async def add_request( return self.rid_to_queue[request_id] + async def _assert_valid_request( + self, + params: SamplingParams, + ) -> None: + """Validate AsyncLLM request attributes. Fail if invalid. + + Args: + params: request parameters + """ + # Prompt logprobs and APC are incompatible + if isinstance(params, SamplingParams): + plp = params.prompt_logprobs + if (await self.get_cache_config() + ).enable_prefix_caching and plp is not None and plp > 0: + raise ValueError(STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED) + # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion # requests we don't need to send multiple messages to core proc, @@ -191,6 +209,7 @@ async def generate( The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. """ + await self._assert_valid_request(sampling_params) try: # We start the output_handler on the first call to generate() so @@ -292,6 +311,9 @@ def encode( async def get_model_config(self) -> ModelConfig: return self.model_config + async def get_cache_config(self) -> CacheConfig: + return self.cache_config + async def get_decoding_config(self): raise ValueError("Not Supported on V1 yet.") diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8ced3a34d2da3..6d6a42e3a7227 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -2,7 +2,7 @@ from typing_extensions import TypeVar -from vllm.config import VllmConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING @@ -20,6 +20,7 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor +from vllm.v1.engine.utils import STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) @@ -43,6 +44,7 @@ def __init__( multiprocess_mode: bool = False, ) -> None: self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -119,6 +121,22 @@ def abort_request(self, request_ids: List[str]) -> None: self.engine_core.abort_requests(request_ids) self.detokenizer.abort_requests(request_ids) + def _assert_valid_request( + self, + params: Union[SamplingParams, PoolingParams], + ) -> None: + """Validate LLMEngine request attributes. Fail if invalid. + + Args: + params: request parameters + """ + # Prompt logprobs and APC are incompatible + if isinstance(params, SamplingParams): + plp = params.prompt_logprobs + if self.get_cache_config( + ).enable_prefix_caching and plp is not None and plp > 0: + raise ValueError(STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED) + def add_request( self, request_id: str, @@ -130,6 +148,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: + self._assert_valid_request(params) # 1) Process raw inputs into the request. 
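# Hedged usage sketch of the validation added above, mirroring the new engine
# tests: under the V1 engine (VLLM_USE_V1=1), a server started with prefix
# caching enabled rejects any request that also asks for prompt logprobs
# before it ever reaches the engine core. Model and parameter values below
# are illustrative.
import os

os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
try:
    llm.generate("Hello, my name is",
                 SamplingParams(temperature=0.8, top_p=0.95,
                                prompt_logprobs=5))
except ValueError as err:
    # The message is STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED, which suggests
    # enable_prefix_caching=False or prompt_logprobs=None instead.
    print(err)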
request = self.processor.process_inputs(request_id, prompt, params, @@ -159,9 +178,12 @@ def step(self) -> List[RequestOutput]: return request_outputs - def get_model_config(self): + def get_model_config(self) -> ModelConfig: return self.model_config + def get_cache_config(self) -> CacheConfig: + return self.cache_config + def start_profile(self): self.engine_core.profile(True) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py new file mode 100644 index 0000000000000..4f32dda9f9964 --- /dev/null +++ b/vllm/v1/engine/utils.py @@ -0,0 +1,14 @@ +"""Engine utils""" + +# Exception strings +STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED=("Request specifies prompt_logprobs, but prompt" + "_logprobs are incompatible with automatic prefix caching" + " which is currently enabled on the vLLM server. Try" + " re-initializing LLM with enable_prefix_caching=False," + " or setting prompt_logprobs=None (which is the default.)") + +STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED=("Request specifies prompt_logprobs, but prompt" + "_logprobs are incompatible with automatic prefix caching" + " which is currently enabled on the vLLM server. Try" + " restarting VLLM with --no-enable-prefix-caching," + " or setting prompt_logprobs=None (which is the default.)") \ No newline at end of file From b2c0c957baee2008e20e94a3f36823555e83d942 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 10 Jan 2025 01:52:16 +0000 Subject: [PATCH 277/293] reverted unwanted changes Signed-off-by: Andrew Feldman --- examples/offline_inference/offline_inference.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/offline_inference/offline_inference.py b/examples/offline_inference/offline_inference.py index 7435d5b00a377..23cc6e8539431 100644 --- a/examples/offline_inference/offline_inference.py +++ b/examples/offline_inference/offline_inference.py @@ -8,12 +8,10 @@ "The future of AI is", ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - prompt_logprobs=5) +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. -llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True) +llm = LLM(model="facebook/opt-125m") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) @@ -21,4 +19,4 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file From 9a40c5f3ecac835137211c783b966760aff074a6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 10 Jan 2025 15:53:59 +0000 Subject: [PATCH 278/293] success Signed-off-by: Andrew Feldman --- tests/v1/engine/test_async_llm.py | 68 ++++++++++++++++++++++++------ tests/v1/engine/test_llm_engine.py | 12 ++++-- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/utils.py | 22 +++++----- 4 files changed, 77 insertions(+), 27 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 221c7b23f53c5..08102cbb07304 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,5 +1,5 @@ import asyncio -from typing import Tuple +from typing import Optional, Tuple import pytest @@ -17,13 +17,38 @@ disable_log_requests=True) -async def generate(engine: AsyncLLM, request_id: str, - max_tokens: int) -> Tuple[int, str]: +async def generate( + engine: AsyncLLM, + request_id: str, + max_tokens: Optional[int] = None, + sampling_params: Optional[SamplingParams] = None, +) -> Tuple[int, str]: + """Wrapper for `AsyncLLM` generation. + + At least one of `max_tokens` and `sampling_params` must + not be `None`. If `sampling_params` is `None`, `max_tokens` + is used to create a `SamplingParams` instance. If + `sampling_params` is provided, `max_tokens` is not used. + + Args: + engine: AsyncLLM instance + request_id: AsyncLLM request ID + max_tokens: (optional) max number of tokens to generate + sampling_params: (optional) request sampling params + + Returns: + count: number of returns from engine.generate() + request_id + """ + assert not (max_tokens is None and sampling_params is None), ( + "At least one of max_tokens and sampling_params" + " must not be None.") + if sampling_params is None: + sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0) count = 0 async for _ in engine.generate(request_id=request_id, prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=max_tokens, temperature=0)): + sampling_params=sampling_params): count += 1 await asyncio.sleep(0.) @@ -31,16 +56,35 @@ async def generate(engine: AsyncLLM, request_id: str, return count, request_id -def test_async_llm_refuses_prompt_logprobs_with_apc(): +@pytest.mark.asyncio +async def test_async_llm_refuses_prompt_logprobs_with_apc(monkeypatch): """Test passes if AsyncLLM raises an exception when it is configured for automatic prefix caching and it receives a request with prompt_logprobs enabled, which is incompatible.""" - with pytest.raises(ValueError) as excinfo: - (LLM(model="facebook/opt-125m", enable_prefix_caching=True).generate( - "Hello, my name is", - SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5))) - # Validate exception string is correct - assert str(excinfo.value) == STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED + # TODO(rickyx): Remove monkeypatch VLLM_USE_V1 setting once we have a + # better way to test V1 so that in the future when we switch, we don't + # have to change all the tests. 
+ monkeypatch.setenv("VLLM_USE_V1", "1") + # Create AsyncLLM engine with APC + apc_engine_args = AsyncEngineArgs(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.8, + disable_log_requests=True) + engine = AsyncLLM.from_engine_args(apc_engine_args) + try: + with pytest.raises(ValueError) as excinfo: + # Issue a request with prompt logprobs enabled, which should fail + await asyncio.create_task( + generate(engine, + "request-0", + sampling_params=SamplingParams(max_tokens=10, + temperature=0, + prompt_logprobs=5))) + # Validate exception string is correct + assert str(excinfo.value) == STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED + finally: + # Shut down engine + engine.shutdown() @pytest.mark.asyncio diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 3efe864bdf30c..37148909b5897 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -1,14 +1,18 @@ """LLMEngine tests""" import pytest + from vllm import LLM, SamplingParams from vllm.v1.engine.utils import STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED -def test_llm_engine_refuses_prompt_logprobs_with_apc(): + +def test_llm_engine_refuses_prompt_logprobs_with_apc(monkeypatch): """Test passes if LLMEngine raises an exception when it is configured for automatic prefix caching and it receives a request with prompt_logprobs enabled, which is incompatible.""" + monkeypatch.setenv("VLLM_USE_V1", "1") with pytest.raises(ValueError) as excinfo: - (LLM(model="facebook/opt-125m",enable_prefix_caching=True) - .generate("Hello, my name is", SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5))) + (LLM(model="facebook/opt-125m", enable_prefix_caching=True).generate( + "Hello, my name is", + SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5))) # Validate exception string is correct - assert str(excinfo.value) == STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED \ No newline at end of file + assert str(excinfo.value) == STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 090e90fbfbc78..6daf1579f3169 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -21,8 +21,8 @@ from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor -from vllm.v1.executor.abstract import Executor from vllm.v1.engine.utils import STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED +from vllm.v1.executor.abstract import Executor logger = init_logger(__name__) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 4f32dda9f9964..d9d82de139935 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -1,14 +1,16 @@ """Engine utils""" # Exception strings -STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED=("Request specifies prompt_logprobs, but prompt" - "_logprobs are incompatible with automatic prefix caching" - " which is currently enabled on the vLLM server. Try" - " re-initializing LLM with enable_prefix_caching=False," - " or setting prompt_logprobs=None (which is the default.)") +STR_LLM_ENGINE_PROMPT_LP_APC_UNSUPPORTED = ( + "Request specifies prompt_logprobs, but prompt" + "_logprobs are incompatible with automatic prefix caching" + " which is currently enabled on the vLLM server. 
Try" + " re-initializing LLM with enable_prefix_caching=False," + " or setting prompt_logprobs=None (which is the default.)") -STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED=("Request specifies prompt_logprobs, but prompt" - "_logprobs are incompatible with automatic prefix caching" - " which is currently enabled on the vLLM server. Try" - " restarting VLLM with --no-enable-prefix-caching," - " or setting prompt_logprobs=None (which is the default.)") \ No newline at end of file +STR_ASYNC_LLM_PROMPT_LP_APC_UNSUPPORTED = ( + "Request specifies prompt_logprobs, but prompt" + "_logprobs are incompatible with automatic prefix caching" + " which is currently enabled on the vLLM server. Try" + " restarting VLLM with --no-enable-prefix-caching," + " or setting prompt_logprobs=None (which is the default.)") From 435bb15ce0e6887998049b2077a3c17b30370590 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 13 Jan 2025 15:16:21 +0000 Subject: [PATCH 279/293] updated --- vllm/v1/engine/detokenizer.py | 28 ++++++++++++++++++---------- vllm/v1/engine/logprobs.py | 0 vllm/v1/engine/output_processor.py | 2 +- 3 files changed, 19 insertions(+), 11 deletions(-) create mode 100644 vllm/v1/engine/logprobs.py diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 76d35495f1392..93fbcec14ac3b 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -11,6 +11,7 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, detokenize_non_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.output_processor import RequestState logger = init_logger(__name__) @@ -333,7 +334,8 @@ def _make_pos_logprob_dict( def update_from_output( self, - output: EngineCoreOutput + output: EngineCoreOutput, + request_state: RequestState, ) -> Optional[DetokenizerOutput]: """ Update RequestState for the request_id by: @@ -347,10 +349,10 @@ def update_from_output( new_token_ids = output.new_token_ids finish_reason = output.finish_reason stop_reason = output.stop_reason - new_logprobs_token_ids = output.new_logprobs_token_ids - new_logprobs = output.new_logprobs - new_prompt_logprobs_token_ids = output.new_prompt_logprobs_token_ids - new_prompt_logprobs = output.new_prompt_logprobs + new_logprobs_token_ids = output.logprobs_token_ids + new_logprobs = output.logprobs + new_prompt_logprobs_token_ids = output.prompt_logprobs_token_ids + new_prompt_logprobs = output.prompt_logprobs # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of @@ -402,7 +404,7 @@ def update_from_output( # 4) Make Prompt Logprobs. prompt_logprobs = self._update_prompt_logprobs( new_prompt_logprobs_token_ids, new_prompt_logprobs, - self.prompt_token_ids) + request_state.prompt_token_ids) # 5) Makes the RequestOutput object with the new text. 
finished = bool(finish_reason) @@ -417,9 +419,15 @@ def update_from_output( prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs return DetokenizerOutput( - output_text, - token_ids, finished, - finish_reason, stop_reason, logprobs, prompt_logprobs, self.cumulative_logprob ) + output_text=output_text, + token_ids=token_ids, + finished=finished, + finish_reason=finish_reason, + stop_reason=stop_reason, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + cumulative_logprob=self.cumulative_logprob, + ) def _get_next_output_text(self, finished: bool, delta: bool) -> str: """If delta is True, only new text since the last call to @@ -435,4 +443,4 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: if last_offset < length: self._last_output_text_offset = length return self.output_text[last_offset:length] - return "" \ No newline at end of file + return "" diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 749f4f5043c97..33b4789012053 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -149,7 +149,7 @@ def process_outputs( # 2) Detokenize the token ids into text. detokenizer_output = req_state.detokenizer.update_from_output( - engine_core_output) + engine_core_output, req_state) # 3) Create and handle RequestOutput objects. if request_output := self._make_request_output( From c99690175b19bb0eb3709339473010d045db5079 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 14 Jan 2025 15:36:01 +0000 Subject: [PATCH 280/293] sort of fixed RequestState cyclical import; added logprobs, prompt_logprobs, and cumulative_logprob to RequestOutput Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 93fbcec14ac3b..e89a6b99813ac 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -11,7 +11,6 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, detokenize_non_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.output_processor import RequestState logger = init_logger(__name__) @@ -20,6 +19,9 @@ class DetokenizerOutput: output_text: str token_ids: List[int] + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + cumulative_logprob: Optional[float] finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None @@ -335,7 +337,7 @@ def _make_pos_logprob_dict( def update_from_output( self, output: EngineCoreOutput, - request_state: RequestState, + request_state: "RequestState", ) -> Optional[DetokenizerOutput]: """ Update RequestState for the request_id by: From ba9561a97ec53d7450426fae5560ddd6c2fcc9ff Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 14 Jan 2025 15:42:44 +0000 Subject: [PATCH 281/293] actually fixed RequestState circular import Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 3 +- vllm/v1/engine/output_processor.py | 42 ++-------------------- vllm/v1/engine/output_processor_utils.py | 44 ++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 41 deletions(-) create mode 100644 vllm/v1/engine/output_processor_utils.py diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e89a6b99813ac..8f9805c8dfc14 
100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -11,6 +11,7 @@ AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, detokenize_non_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.output_processor_utils import RequestState logger = init_logger(__name__) @@ -337,7 +338,7 @@ def _make_pos_logprob_dict( def update_from_output( self, output: EngineCoreOutput, - request_state: "RequestState", + request_state: RequestState, ) -> Optional[DetokenizerOutput]: """ Update RequestState for the request_id by: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 33b4789012053..f05786670332c 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -3,11 +3,10 @@ from typing import Dict, List, Optional from vllm.outputs import RequestOutput -from vllm.transformers_utils.detokenizer_utils import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import (DetokenizerOutput, - IncrementalDetokenizer) +from vllm.v1.engine.detokenizer import DetokenizerOutput +from vllm.v1.engine.output_processor_utils import RequestState from vllm.v1.metrics.stats import IterationStats @@ -19,43 +18,6 @@ class OutputProcessorOutput: iteration_stats: IterationStats -class RequestState: - - def __init__( - self, - request_id: str, - prompt: Optional[str], - prompt_token_ids: List[int], - detokenizer: IncrementalDetokenizer, - queue: Optional[asyncio.Queue[RequestOutput]], - ): - self.request_id = request_id - self.prompt = prompt - self.prompt_token_ids = prompt_token_ids - self.prompt_len = len(prompt_token_ids) - self.detokenizer = detokenizer - self.is_prefilling = True - self.queue = queue - - @classmethod - def from_new_request( - cls, - tokenizer: AnyTokenizer, - request: EngineCoreRequest, - queue: Optional[asyncio.Queue[RequestOutput]] = None, - ) -> "RequestState": - return cls( - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - detokenizer=IncrementalDetokenizer.from_new_request( - tokenizer=tokenizer, - request=request, - ), - queue=queue, - ) - - class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" diff --git a/vllm/v1/engine/output_processor_utils.py b/vllm/v1/engine/output_processor_utils.py new file mode 100644 index 0000000000000..69b49b3ff046b --- /dev/null +++ b/vllm/v1/engine/output_processor_utils.py @@ -0,0 +1,44 @@ +"""Utils supporting :class:`OutputProcessor`""" +import asyncio +from typing import List, Optional + +from vllm.outputs import RequestOutput +from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.detokenizer import IncrementalDetokenizer + +class RequestState: + + def __init__( + self, + request_id: str, + prompt: Optional[str], + prompt_token_ids: List[int], + detokenizer: IncrementalDetokenizer, + queue: Optional[asyncio.Queue[RequestOutput]], + ): + self.request_id = request_id + self.prompt = prompt + self.prompt_token_ids = prompt_token_ids + self.prompt_len = len(prompt_token_ids) + self.detokenizer = detokenizer + self.is_prefilling = True + self.queue = queue + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + queue: Optional[asyncio.Queue[RequestOutput]] = None, + ) -> 
"RequestState": + return cls( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + detokenizer=IncrementalDetokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ), + queue=queue, + ) \ No newline at end of file From 34735be71a3fa326b52d711e2a8ff32bd30fadd1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 14 Jan 2025 15:46:25 +0000 Subject: [PATCH 282/293] woops Signed-off-by: Andrew Feldman --- vllm/v1/engine/output_processor_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/output_processor_utils.py b/vllm/v1/engine/output_processor_utils.py index 69b49b3ff046b..221228594cdf6 100644 --- a/vllm/v1/engine/output_processor_utils.py +++ b/vllm/v1/engine/output_processor_utils.py @@ -7,6 +7,7 @@ from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import IncrementalDetokenizer + class RequestState: def __init__( @@ -41,4 +42,4 @@ def from_new_request( request=request, ), queue=queue, - ) \ No newline at end of file + ) From 49c2c8cee8f1e3657cc240cf53da5ce8d13a93bf Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 14 Jan 2025 22:18:17 +0000 Subject: [PATCH 283/293] wip Signed-off-by: Andrew Feldman --- vllm/v1/engine/detokenizer.py | 7 - vllm/v1/engine/logprobs.py | 353 ++++++++++++++++++++++++++++++++++ 2 files changed, 353 insertions(+), 7 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 8f9805c8dfc14..406266297be0d 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -53,13 +53,6 @@ class IncrementalDetokenizer: # Tokenizer for this request tokenizer: AnyTokenizer - # Logprobs for this request - logprobs: Optional[SampleLogprobs] - prompt_logprobs: Optional[PromptLogprobs] - cumulative_logprob: Optional[float] - num_logprobs: int - num_prompt_logprobs: int - # Accounting for stop string buffering stop_buffer_length: int _last_output_text_offset: int = 0 diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index e69de29bb2d1d..d74cdb8de9cb7 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -0,0 +1,353 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +import torch + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.sampling_params import RequestOutputKind +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, + detokenize_non_incrementally) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.output_processor_utils import RequestState +from vllm.v1.engine.detokenizer import IncrementalDetokenizer + +logger = init_logger(__name__) + + +@dataclass +class LogprobsOutput: + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + cumulative_logprob: Optional[float] + + +@dataclass +class LogprobsProcessor: + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Logprobs for this request + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + cumulative_logprob: Optional[float] + num_logprobs: int + num_prompt_logprobs: int + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset 
= convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.sampling_params.skip_special_tokens, + ) + + stops = request.sampling_params.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.sampling_params.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + logprobs = request.sampling_params.logprobs + prompt_logprobs = request.sampling_params.prompt_logprobs + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. + spaces_between_special_tokens, + output_kind=request.sampling_params.output_kind, + prompt_len=len(request.prompt_token_ids), + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + cumulative_logprob=(0. if logprobs else None), + logprobs=([] if logprobs else None), + prompt_logprobs=([] if prompt_logprobs else None), + num_prompt_logprobs=(prompt_logprobs or 0), + num_logprobs=(logprobs or 0), + ) + + def _update_sample_logprobs( + self, + sampled_token_ids: List[int], + token_ids_lst: List[torch.Tensor], + sample_logprobs_lst: List[torch.Tensor], + ) -> Optional[SampleLogprobs]: + """Incorporate sample logprobs from this step, if they exist. + + Lists are only of length >1 if EngineCore made + >1 tokens in prior step (e.g. in spec decoding). + + Args: + sampled_token_ids: list of int token ids + token_ids_list: list of (topk + 1) token ids tensors at each pos; + `None` if sample logprobs are disabled in this req + sample_logprobs: list of (topk + 1) logprobs tensors at each pos; + `None` if sample logprobs are disabled in this req + + Return: + Sample logprobs, if required for this request + """ + assert self.logprobs is not None + + for sampled_token_id, logprobs, token_ids in zip( + sampled_token_ids, sample_logprobs_lst, token_ids_lst): + + # Split into sampled vs top_k. + assert sampled_token_id == token_ids[0].item(), ( + "Sampler concats the sampled token logprob in front of " + f"the topk logprobs, but got {sampled_token_id=} and " + f"{token_ids[0].item()=}") + sampled_token_logprob = logprobs[0].item() + topk_token_ids = token_ids[1:] + topk_logprobs = logprobs[1:] + + # Detokenize non-incrementally. 
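+            # NOTE: `detokenize_non_incrementally` is expected to return one
+            # decoded string per id in `topk_token_ids`; these strings are
+            # paired with `topk_logprobs` in `_make_pos_logprob_dict` below.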
+ decoded_tokens = detokenize_non_incrementally( + self.tokenizer, topk_token_ids) + + # Make the dict of top-token Logprob objects associated with the + # current sequence offset + if sampled_token_id in topk_token_ids: + pos_logprobs_dict = self._make_pos_logprob_dict( + topk_logprobs.tolist(), topk_token_ids.tolist(), + decoded_tokens, self.num_logprobs) + else: + # If the sampled token is not one of the top tokens + # at this sequence offset, inject the sampled token + # & its Logprob instance into the dict + sample_logprob_obj = Logprob( + logprob=sampled_token_logprob, + decoded_token=self.tokenizer.decode(sampled_token_id)) + pos_logprobs_dict = self._make_pos_logprob_dict( + topk_logprobs.tolist(), topk_token_ids.tolist(), + decoded_tokens, self.num_logprobs, + (sampled_token_id, sample_logprob_obj)) + + self.logprobs.append(pos_logprobs_dict) + self.cumulative_logprob += sampled_token_logprob + + # Return just the newly generated sample logprobs. + num_new_tokens = len(sampled_token_ids) + return self.logprobs[-num_new_tokens:] + + def _update_prompt_logprobs( + self, + token_ids: Optional[torch.Tensor], + prompt_logprobs: Optional[torch.Tensor], + prompt_token_ids_lst: List[int], + ) -> Optional[PromptLogprobs]: + """Incorporate prompt logprobs from this step, if they exist. + + If prompt logprobs are enabled for this request and EngineCore + prefilled the prompt or a chunk of the prompt in this step, + both arguments should be non-empty lists. + + If prompt logprobs are enabled but prefill is completed, both + arguments should be empty lists. + + If prompt logprobs are disabled, both arguments should be `None`. + + Args: + token_ids: (num prompt tokens-1) x (topk + 1) token ids tensor + `None` if prompt logprobs are disabled in this req + prompt_logprobs: (num prompt tokens-1) x (topk + 1) logprobs tensor + prompt_token_ids_lst: (num prompt tokens)-length list of prompt + token ids + + Return: + Prompt logprobs, if required for this request + """ + assert prompt_logprobs is not None + assert token_ids is not None + if prompt_logprobs.numel() == 0: + # Prompt logprobs are enabled for this request but prefill + # is finished and no more logprobs are being streamed from + # engine core + return [] + # Prompt logprobs are enabled & engine core is streaming prompt + # logprobs, in one or more chunks. + assert self.prompt_logprobs is not None + + if len(self.prompt_logprobs) == 0: + self.prompt_logprobs = [None] + + # Detokenize non-incrementally. + # NOTE(rob): the output is flattened: + # [num_tok, num_lps] -> [num_tok * num_lps] + decoded_tokens = detokenize_non_incrementally(self.tokenizer, + token_ids) + + # Make Logprob for each token. + num_chunk_tokens, decoded_tokens_stride = prompt_logprobs.shape + prompt_idx = len(self.prompt_logprobs) + for tok_idx, prompt_token_id in zip(range(num_chunk_tokens), + prompt_token_ids_lst[prompt_idx:]): + # Iterate over prefill chunk + assert prompt_token_id + assert prompt_token_id == token_ids[tok_idx, 0].item(), ( + "Sampler concats the prompt token logprob in front of " + f"the topk logprobs, but got {prompt_token_id=} and " + f"{token_ids[tok_idx, 0].item()=}") + # Split into prompt token vs top_k. 
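+            # Column 0 of `token_ids`/`prompt_logprobs` holds the prompt
+            # token actually at this position (see the assertion above);
+            # columns 1..k hold the top-k alternatives. `decoded_tokens_offset`
+            # (below) skips this row's column-0 string in the flattened
+            # `decoded_tokens` list.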
+ prompt_token_logprob = prompt_logprobs[tok_idx, 0].item() + topk_token_ids = token_ids[tok_idx, 1:] + topk_logprobs = prompt_logprobs[tok_idx, 1:] + decoded_tokens_offset = tok_idx * decoded_tokens_stride + 1 + + # Make the dict of top-token Logprob objects associated with the + # current prompt offset + if prompt_token_id in topk_token_ids: + self.prompt_logprobs.append( + self._make_pos_logprob_dict( + topk_logprobs.tolist(), + topk_token_ids.tolist(), + # Deal with the flattening from above. + decoded_tokens[decoded_tokens_offset:], + self.num_prompt_logprobs, + )) + else: + # If the prompt token is not one of the top tokens + # at this prompt offset, inject the prompt token + # & its Logprob instance into the dict + prompt_logprob_obj = Logprob( + logprob=prompt_token_logprob, + decoded_token=self.tokenizer.decode(prompt_token_id)) + self.prompt_logprobs.append( + self._make_pos_logprob_dict( + topk_logprobs.tolist(), topk_token_ids.tolist(), + decoded_tokens[decoded_tokens_offset:], + self.num_prompt_logprobs, + (prompt_token_id, prompt_logprob_obj))) + return self.prompt_logprobs + + @staticmethod + def _make_pos_logprob_dict( + logprobs: List[float], + logprob_token_ids: List[int], + decoded_tokens: List[str], + num_logprobs: int, + special_token_id_logprob: Optional[Tuple[int, Logprob]] = None, + ) -> Dict[int, Logprob]: + """Make a Logprob dictionary for a position in the sequence. + + Returns a dictionary mapping top token ids to Logprob data + structures. Each Logprob data structure includes log probability, + decoded token, and rank (index+1). The size of the dict returned + will be be num_logprobs. + + If the special token (sampled token or prompt token associated + with the current sequence position) is not among the top logprobs, + then special_token_id_logprob = (special_token_id,logprob) must be + provided; an additional dictionary entry mapping special_token_id -> + logprob will be injected with rank equal to num_logprobs + 1 + (special_token_id must be lowest-rank if we are having to inject it.) + Note that the size of the dict returned will then be num_logprobs + 1. + + Args: + logprobs: list of log probabilities + logprob_token_ids: list of top token ids + decoded_tokens: list of decoded top tokens + num_logprobs: number of top tokens + special_token_id_logprob: (optional) tuple of + (special_token_id,logprob) associated with + sampled token or prompt token + + Returns: + Dict[top token id, Logprob]; num_logprobs or num_logprobs+1 + keys in total + + """ + # Sampler uses torch.topk() which sorts so the + # index in lists is equivalent to rank-1. 
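+        # Illustrative example (made-up values): with num_logprobs=2,
+        # logprobs=[-0.1, -2.3], logprob_token_ids=[42, 7] and
+        # decoded_tokens=["Hi", "lo"], the dict built below is
+        # {42: Logprob(logprob=-0.1, rank=1, decoded_token="Hi"),
+        #  7: Logprob(logprob=-2.3, rank=2, decoded_token="lo")};
+        # an out-of-top-k sampled/prompt token would then be injected
+        # with rank=3 by the block further down.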
+ logprobs_dict = { + logprob_token_ids[idx]: Logprob( + logprob=logprobs[idx], + rank=idx + 1, + decoded_token=decoded_tokens[idx], + ) + for idx in range(num_logprobs) + } + + # Inject special token Logprob if necessary + if special_token_id_logprob: + special_token_id = special_token_id_logprob[0] + special_logprob_obj = special_token_id_logprob[1] + assert special_token_id is not None + assert special_logprob_obj is not None + special_logprob_obj.rank = num_logprobs + 1 + logprobs_dict[special_token_id] = special_logprob_obj + + return logprobs_dict + + def update_from_output( + self, + output: EngineCoreOutput, + request_state: RequestState, + ) -> Optional[LogprobsOutput]: + """ + Update RequestState for the request_id by: + """ + + # new_token_ids = output.new_token_ids + # finish_reason = output.finish_reason + # stop_reason = output.stop_reason + new_logprobs_token_ids = output.logprobs_token_ids + new_logprobs = output.logprobs + new_prompt_logprobs_token_ids = output.prompt_logprobs_token_ids + new_prompt_logprobs = output.prompt_logprobs + + # 1) Make Sample Logprobs, if requested + logprobs = (None if self.num_logprobs == 0 else + self._update_sample_logprobs( + new_token_ids, + new_logprobs_token_ids, + new_logprobs, + )) + + # 4) Make Prompt Logprobs. + prompt_logprobs = ( + None if self.num_prompt_logprobs else self._update_prompt_logprobs( + new_prompt_logprobs_token_ids, new_prompt_logprobs, + request_state.prompt_token_ids)) + + # 5) Makes the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + logprobs = logprobs if delta else self.logprobs + prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs + + return LogprobsOutput( + output_text=output_text, + token_ids=token_ids, + finished=finished, + finish_reason=finish_reason, + stop_reason=stop_reason, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + cumulative_logprob=self.cumulative_logprob, + ) From 016e747a032d70a95d7d691ab1dfc0bb9d5b5beb Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 05:32:51 +0000 Subject: [PATCH 284/293] untested first-pass at logprobs integration into new output processing abstraction Signed-off-by: Andrew Feldman --- vllm/v1/core/scheduler.py | 16 +- vllm/v1/engine/__init__.py | 8 +- vllm/v1/engine/detokenizer.py | 267 +---------------------- vllm/v1/engine/logprobs.py | 97 +++----- vllm/v1/engine/output_processor.py | 26 ++- vllm/v1/engine/output_processor_utils.py | 7 + 6 files changed, 73 insertions(+), 348 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index bda4a01baae8b..0e0ed3f363736 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -461,10 +461,10 @@ def update_from_output( new_token_ids=request.output_token_ids[-num_new_tokens:], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - logprobs_token_ids=logprobs_token_ids, - logprobs=logprobs, - prompt_logprobs_token_ids=prompt_logprobs_token_ids, - prompt_logprobs=prompt_logprobs, + new_logprobs_token_ids=logprobs_token_ids, + new_logprobs=logprobs, + new_prompt_logprobs_token_ids=prompt_logprobs_token_ids, + new_prompt_logprobs=prompt_logprobs, stop_reason=request.stop_reason) outputs.append(output) @@ -481,10 +481,10 @@ def 
update_from_output( new_token_ids=[], finished=request.is_finished(), finish_reason=request.get_finished_reason(), - logprobs_token_ids=[], - logprobs=[], - prompt_logprobs_token_ids=prompt_logprobs_token_ids, - prompt_logprobs=prompt_logprobs, + new_logprobs_token_ids=[], + new_logprobs=[], + new_prompt_logprobs_token_ids=prompt_logprobs_token_ids, + new_prompt_logprobs=prompt_logprobs, stop_reason=request.stop_reason) outputs.append(output) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 5aec113d83a06..8d46304239384 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -43,10 +43,10 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] - logprobs: List[torch.Tensor] - logprobs_token_ids: List[torch.Tensor] - prompt_logprobs: Optional[torch.Tensor] - prompt_logprobs_token_ids: Optional[torch.Tensor] + new_logprobs: List[torch.Tensor] + new_logprobs_token_ids: List[torch.Tensor] + new_prompt_logprobs: Optional[torch.Tensor] + new_prompt_logprobs_token_ids: Optional[torch.Tensor] finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 406266297be0d..871c71a8e2ef8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,17 +1,12 @@ from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union - -import torch +from typing import List, Optional, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind -from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, - detokenize_non_incrementally) + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.output_processor_utils import RequestState logger = init_logger(__name__) @@ -20,9 +15,6 @@ class DetokenizerOutput: output_text: str token_ids: List[int] - logprobs: Optional[SampleLogprobs] - prompt_logprobs: Optional[PromptLogprobs] - cumulative_logprob: Optional[float] finished: bool finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None @@ -82,8 +74,6 @@ def from_new_request( else: stop_buffer_length = 0 - logprobs = request.sampling_params.logprobs - prompt_logprobs = request.sampling_params.prompt_logprobs return cls( output_text="", tokens=tokens, @@ -102,253 +92,21 @@ def from_new_request( prompt_len=len(request.prompt_token_ids), tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, - cumulative_logprob=(0. if logprobs else None), - logprobs=([] if logprobs else None), - prompt_logprobs=([] if prompt_logprobs else None), - num_prompt_logprobs=(prompt_logprobs or 0), - num_logprobs=(logprobs or 0), ) - def _update_sample_logprobs( - self, - sampled_token_ids: List[int], - token_ids_lst: List[torch.Tensor], - sample_logprobs_lst: List[torch.Tensor], - ) -> Optional[SampleLogprobs]: - """Incorporate sample logprobs from this step, if they exist. - - Lists are only of length >1 if EngineCore made - >1 tokens in prior step (e.g. in spec decoding). 
- - Args: - sampled_token_ids: list of int token ids - token_ids_list: list of (topk + 1) token ids tensors at each pos; - `None` if sample logprobs are disabled in this req - sample_logprobs: list of (topk + 1) logprobs tensors at each pos; - `None` if sample logprobs are disabled in this req - - Return: - Sample logprobs, if required for this request - """ - if self.num_logprobs == 0: - # Sample logprobs disabled for this request - return None - assert self.logprobs is not None - - for sampled_token_id, logprobs, token_ids in zip( - sampled_token_ids, sample_logprobs_lst, token_ids_lst): - - # Split into sampled vs top_k. - assert sampled_token_id == token_ids[0].item(), ( - "Sampler concats the sampled token logprob in front of " - f"the topk logprobs, but got {sampled_token_id=} and " - f"{token_ids[0].item()=}") - sampled_token_logprob = logprobs[0].item() - topk_token_ids = token_ids[1:] - topk_logprobs = logprobs[1:] - - # Detokenize non-incrementally. - decoded_tokens = detokenize_non_incrementally( - self.tokenizer, topk_token_ids) - - # Make the dict of top-token Logprob objects associated with the - # current sequence offset - if sampled_token_id in topk_token_ids: - pos_logprobs_dict = self._make_pos_logprob_dict( - topk_logprobs.tolist(), topk_token_ids.tolist(), - decoded_tokens, self.num_logprobs) - else: - # If the sampled token is not one of the top tokens - # at this sequence offset, inject the sampled token - # & its Logprob instance into the dict - sample_logprob_obj = Logprob( - logprob=sampled_token_logprob, - decoded_token=self.tokenizer.decode(sampled_token_id)) - pos_logprobs_dict = self._make_pos_logprob_dict( - topk_logprobs.tolist(), topk_token_ids.tolist(), - decoded_tokens, self.num_logprobs, - (sampled_token_id, sample_logprob_obj)) - - self.logprobs.append(pos_logprobs_dict) - self.cumulative_logprob += sampled_token_logprob - - # Return just the newly generated sample logprobs. - num_new_tokens = len(sampled_token_ids) - return self.logprobs[-num_new_tokens:] - - def _update_prompt_logprobs( - self, - token_ids: Optional[torch.Tensor], - prompt_logprobs: Optional[torch.Tensor], - prompt_token_ids_lst: List[int], - ) -> Optional[PromptLogprobs]: - """Incorporate prompt logprobs from this step, if they exist. - - If prompt logprobs are enabled for this request and EngineCore - prefilled the prompt or a chunk of the prompt in this step, - both arguments should be non-empty lists. - - If prompt logprobs are enabled but prefill is completed, both - arguments should be empty lists. - - If prompt logprobs are disabled, both arguments should be `None`. - - Args: - token_ids: (num prompt tokens-1) x (topk + 1) token ids tensor - `None` if prompt logprobs are disabled in this req - prompt_logprobs: (num prompt tokens-1) x (topk + 1) logprobs tensor - prompt_token_ids_lst: (num prompt tokens)-length list of prompt - token ids - - Return: - Prompt logprobs, if required for this request - """ - if self.num_prompt_logprobs == 0: - # Prompt logprobs disabled for this request - return None - assert prompt_logprobs is not None - assert token_ids is not None - if prompt_logprobs.numel() == 0: - # Prompt logprobs are enabled for this request but prefill - # is finished and no more logprobs are being streamed from - # engine core - return [] - # Prompt logprobs are enabled & engine core is streaming prompt - # logprobs, in one or more chunks. 
- assert self.prompt_logprobs is not None - - if len(self.prompt_logprobs) == 0: - self.prompt_logprobs = [None] - - # Detokenize non-incrementally. - # NOTE(rob): the output is flattened: - # [num_tok, num_lps] -> [num_tok * num_lps] - decoded_tokens = detokenize_non_incrementally(self.tokenizer, - token_ids) - - # Make Logprob for each token. - num_chunk_tokens, decoded_tokens_stride = prompt_logprobs.shape - prompt_idx = len(self.prompt_logprobs) - for tok_idx, prompt_token_id in zip(range(num_chunk_tokens), - prompt_token_ids_lst[prompt_idx:]): - # Iterate over prefill chunk - assert prompt_token_id - assert prompt_token_id == token_ids[tok_idx, 0].item(), ( - "Sampler concats the prompt token logprob in front of " - f"the topk logprobs, but got {prompt_token_id=} and " - f"{token_ids[tok_idx, 0].item()=}") - # Split into prompt token vs top_k. - prompt_token_logprob = prompt_logprobs[tok_idx, 0].item() - topk_token_ids = token_ids[tok_idx, 1:] - topk_logprobs = prompt_logprobs[tok_idx, 1:] - decoded_tokens_offset = tok_idx * decoded_tokens_stride + 1 - - # Make the dict of top-token Logprob objects associated with the - # current prompt offset - if prompt_token_id in topk_token_ids: - self.prompt_logprobs.append( - self._make_pos_logprob_dict( - topk_logprobs.tolist(), - topk_token_ids.tolist(), - # Deal with the flattening from above. - decoded_tokens[decoded_tokens_offset:], - self.num_prompt_logprobs, - )) - else: - # If the prompt token is not one of the top tokens - # at this prompt offset, inject the prompt token - # & its Logprob instance into the dict - prompt_logprob_obj = Logprob( - logprob=prompt_token_logprob, - decoded_token=self.tokenizer.decode(prompt_token_id)) - self.prompt_logprobs.append( - self._make_pos_logprob_dict( - topk_logprobs.tolist(), topk_token_ids.tolist(), - decoded_tokens[decoded_tokens_offset:], - self.num_prompt_logprobs, - (prompt_token_id, prompt_logprob_obj))) - return self.prompt_logprobs - - @staticmethod - def _make_pos_logprob_dict( - logprobs: List[float], - logprob_token_ids: List[int], - decoded_tokens: List[str], - num_logprobs: int, - special_token_id_logprob: Optional[Tuple[int, Logprob]] = None, - ) -> Dict[int, Logprob]: - """Make a Logprob dictionary for a position in the sequence. - - Returns a dictionary mapping top token ids to Logprob data - structures. Each Logprob data structure includes log probability, - decoded token, and rank (index+1). The size of the dict returned - will be be num_logprobs. - - If the special token (sampled token or prompt token associated - with the current sequence position) is not among the top logprobs, - then special_token_id_logprob = (special_token_id,logprob) must be - provided; an additional dictionary entry mapping special_token_id -> - logprob will be injected with rank equal to num_logprobs + 1 - (special_token_id must be lowest-rank if we are having to inject it.) - Note that the size of the dict returned will then be num_logprobs + 1. - - Args: - logprobs: list of log probabilities - logprob_token_ids: list of top token ids - decoded_tokens: list of decoded top tokens - num_logprobs: number of top tokens - special_token_id_logprob: (optional) tuple of - (special_token_id,logprob) associated with - sampled token or prompt token - - Returns: - Dict[top token id, Logprob]; num_logprobs or num_logprobs+1 - keys in total - - """ - # Sampler uses torch.topk() which sorts so the - # index in lists is equivalent to rank-1. 
- logprobs_dict = { - logprob_token_ids[idx]: Logprob( - logprob=logprobs[idx], - rank=idx + 1, - decoded_token=decoded_tokens[idx], - ) - for idx in range(num_logprobs) - } - - # Inject special token Logprob if necessary - if special_token_id_logprob: - special_token_id = special_token_id_logprob[0] - special_logprob_obj = special_token_id_logprob[1] - assert special_token_id is not None - assert special_logprob_obj is not None - special_logprob_obj.rank = num_logprobs + 1 - logprobs_dict[special_token_id] = special_logprob_obj - - return logprobs_dict - def update_from_output( self, output: EngineCoreOutput, - request_state: RequestState, ) -> Optional[DetokenizerOutput]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. 2) Evaluate stop criteria. - 3) Detokenize sample logprobs non-incrementally. - 4) Detokenize prompt logprobs non-incrementally. - 5) Make the `RequestOutput` object with new text. + 3) Make the `RequestOutput` object with new text. """ - new_token_ids = output.new_token_ids finish_reason = output.finish_reason stop_reason = output.stop_reason - new_logprobs_token_ids = output.logprobs_token_ids - new_logprobs = output.logprobs - new_prompt_logprobs_token_ids = output.prompt_logprobs_token_ids - new_prompt_logprobs = output.prompt_logprobs # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of @@ -390,19 +148,7 @@ def update_from_output( finish_reason = "stop" # TODO: use constant stop_reason = stop_str - # 3) Make Sample Logprobs. - logprobs = self._update_sample_logprobs( - new_token_ids, - new_logprobs_token_ids, - new_logprobs, - ) - - # 4) Make Prompt Logprobs. - prompt_logprobs = self._update_prompt_logprobs( - new_prompt_logprobs_token_ids, new_prompt_logprobs, - request_state.prompt_token_ids) - - # 5) Makes the RequestOutput object with the new text. + # 3) Makes the RequestOutput object with the new text. 
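+        # (Sample/prompt logprob assembly is handled separately by
+        #  LogprobsProcessor in vllm/v1/engine/logprobs.py.)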
finished = bool(finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: @@ -411,8 +157,6 @@ def update_from_output( delta = self.output_kind == RequestOutputKind.DELTA output_text = self._get_next_output_text(finished, delta) token_ids = new_token_ids if delta else self.output_token_ids - logprobs = logprobs if delta else self.logprobs - prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs return DetokenizerOutput( output_text=output_text, @@ -420,9 +164,6 @@ def update_from_output( finished=finished, finish_reason=finish_reason, stop_reason=stop_reason, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, - cumulative_logprob=self.cumulative_logprob, ) def _get_next_output_text(self, finished: bool, delta: bool) -> str: diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index d74cdb8de9cb7..56e028d29c88d 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -1,18 +1,14 @@ from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple import torch -from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( - AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally, - detokenize_non_incrementally) + AnyTokenizer, detokenize_non_incrementally) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.output_processor_utils import RequestState -from vllm.v1.engine.detokenizer import IncrementalDetokenizer logger = init_logger(__name__) @@ -30,6 +26,12 @@ class LogprobsProcessor: # Tokenizer for this request tokenizer: AnyTokenizer + # Request output kind + output_kind: RequestOutputKind + + # Prompt tokens + prompt_token_ids: List[int] + # Logprobs for this request logprobs: Optional[SampleLogprobs] prompt_logprobs: Optional[PromptLogprobs] @@ -42,47 +44,18 @@ def from_new_request( cls, tokenizer: AnyTokenizer, request: EngineCoreRequest, - ) -> "IncrementalDetokenizer": - - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.sampling_params.skip_special_tokens, - ) - - stops = request.sampling_params.stop - # Number of chars to hold back when stop strings are to be excluded - # from streamed output. - if stops and not request.sampling_params.include_stop_str_in_output: - stop_buffer_length = max(len(s) for s in stops) - 1 - else: - stop_buffer_length = 0 - - logprobs = request.sampling_params.logprobs - prompt_logprobs = request.sampling_params.prompt_logprobs + ) -> "LogprobsProcessor": + num_logprobs = request.sampling_params.logprobs + num_prompt_logprobs = request.sampling_params.prompt_logprobs return cls( - output_text="", - tokens=tokens, - # Detokenizer mutates this list, so need a unique copy. - # NOTE(Nick): could we take ownership of it though? - token_ids=request.prompt_token_ids.copy(), - stop=stops, - include_stop_str_in_output=request.sampling_params. - include_stop_str_in_output, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=request.sampling_params.skip_special_tokens, - spaces_between_special_tokens=request.sampling_params. 
- spaces_between_special_tokens, - output_kind=request.sampling_params.output_kind, - prompt_len=len(request.prompt_token_ids), tokenizer=tokenizer, - stop_buffer_length=stop_buffer_length, - cumulative_logprob=(0. if logprobs else None), - logprobs=([] if logprobs else None), - prompt_logprobs=([] if prompt_logprobs else None), - num_prompt_logprobs=(prompt_logprobs or 0), - num_logprobs=(logprobs or 0), + output_kind=request.sampling_params.output_kind, + prompt_token_ids=request.prompt_token_ids, + cumulative_logprob=(0. if num_logprobs else None), + logprobs=([] if num_logprobs else None), + prompt_logprobs=([] if num_prompt_logprobs else None), + num_prompt_logprobs=(num_prompt_logprobs or 0), + num_logprobs=(num_logprobs or 0), ) def _update_sample_logprobs( @@ -301,19 +274,15 @@ def _make_pos_logprob_dict( def update_from_output( self, output: EngineCoreOutput, - request_state: RequestState, ) -> Optional[LogprobsOutput]: """ Update RequestState for the request_id by: """ - - # new_token_ids = output.new_token_ids - # finish_reason = output.finish_reason - # stop_reason = output.stop_reason - new_logprobs_token_ids = output.logprobs_token_ids - new_logprobs = output.logprobs - new_prompt_logprobs_token_ids = output.prompt_logprobs_token_ids - new_prompt_logprobs = output.prompt_logprobs + new_token_ids = output.new_token_ids + new_logprobs_token_ids = output.new_logprobs_token_ids + new_logprobs = output.new_logprobs + new_prompt_logprobs_token_ids = output.new_prompt_logprobs_token_ids + new_prompt_logprobs = output.new_prompt_logprobs # 1) Make Sample Logprobs, if requested logprobs = (None if self.num_logprobs == 0 else @@ -324,29 +293,21 @@ def update_from_output( )) # 4) Make Prompt Logprobs. - prompt_logprobs = ( - None if self.num_prompt_logprobs else self._update_prompt_logprobs( - new_prompt_logprobs_token_ids, new_prompt_logprobs, - request_state.prompt_token_ids)) + prompt_logprobs = (None if self.num_prompt_logprobs else + self._update_prompt_logprobs( + new_prompt_logprobs_token_ids, + new_prompt_logprobs, self.prompt_token_ids)) - # 5) Makes the RequestOutput object with the new text. - finished = bool(finish_reason) + # 5) Makes the LogprobsOutput object with the new text. 
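+        # `delta` below selects between returning only this step's logprobs
+        # and returning the full accumulated lists, mirroring the
+        # detokenizer's handling of output text.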
+ finished = bool(output.finish_reason) if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: return None - delta = self.output_kind == RequestOutputKind.DELTA - output_text = self._get_next_output_text(finished, delta) - token_ids = new_token_ids if delta else self.output_token_ids logprobs = logprobs if delta else self.logprobs prompt_logprobs = prompt_logprobs if delta else self.prompt_logprobs return LogprobsOutput( - output_text=output_text, - token_ids=token_ids, - finished=finished, - finish_reason=finish_reason, - stop_reason=stop_reason, logprobs=logprobs, prompt_logprobs=prompt_logprobs, cumulative_logprob=self.cumulative_logprob, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index f05786670332c..7c4a4dc4965eb 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -6,6 +6,7 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.detokenizer import DetokenizerOutput +from vllm.v1.engine.logprobs import LogprobsOutput from vllm.v1.engine.output_processor_utils import RequestState from vllm.v1.metrics.stats import IterationStats @@ -110,12 +111,20 @@ def process_outputs( req_state.is_prefilling = False # 2) Detokenize the token ids into text. - detokenizer_output = req_state.detokenizer.update_from_output( - engine_core_output, req_state) - - # 3) Create and handle RequestOutput objects. + if detokenizer_output := req_state.detokenizer.update_from_output( + engine_core_output): + # Detect if detokenizer updated `finish_reason` + engine_core_output.finish_reason = ( + detokenizer_output.finish_reason) + + # 3) Compute sample and prompt logprobs for request, + # if required. + logprobs_output = req_state.logprobs_processor.update_from_output( + engine_core_output) + + # 4) Create and handle RequestOutput objects. if request_output := self._make_request_output( - req_state, detokenizer_output): + req_state, logprobs_output, detokenizer_output): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). 
req_state.queue.put_nowait(request_output) @@ -140,16 +149,23 @@ def process_outputs( def _make_request_output( self, request_state: RequestState, + logprobs_output: Optional[LogprobsOutput], detokenizer_output: Optional[DetokenizerOutput], ) -> Optional[RequestOutput]: if detokenizer_output is None: + # Only happens with FINAL request output kind when + # we are not on the final step return None + assert logprobs_output is not None request_output = RequestOutput.new( request_state.request_id, request_state.prompt, request_state.prompt_token_ids, + logprobs_output.logprobs, + logprobs_output.prompt_logprobs, + logprobs_output.cumulative_logprob, detokenizer_output.output_text, detokenizer_output.token_ids, detokenizer_output.finished, diff --git a/vllm/v1/engine/output_processor_utils.py b/vllm/v1/engine/output_processor_utils.py index 221228594cdf6..05d9a79b9698f 100644 --- a/vllm/v1/engine/output_processor_utils.py +++ b/vllm/v1/engine/output_processor_utils.py @@ -6,6 +6,7 @@ from vllm.transformers_utils.detokenizer_utils import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import IncrementalDetokenizer +from vllm.v1.engine.logprobs import LogprobsProcessor class RequestState: @@ -15,6 +16,7 @@ def __init__( request_id: str, prompt: Optional[str], prompt_token_ids: List[int], + logprobs_processor: LogprobsProcessor, detokenizer: IncrementalDetokenizer, queue: Optional[asyncio.Queue[RequestOutput]], ): @@ -22,6 +24,7 @@ def __init__( self.prompt = prompt self.prompt_token_ids = prompt_token_ids self.prompt_len = len(prompt_token_ids) + self.logprobs_processor = logprobs_processor self.detokenizer = detokenizer self.is_prefilling = True self.queue = queue @@ -37,6 +40,10 @@ def from_new_request( request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, + logprobs_processor=LogprobsProcessor.from_new_request( + tokenizer=tokenizer, + request=request, + ), detokenizer=IncrementalDetokenizer.from_new_request( tokenizer=tokenizer, request=request, From cda2ba21cff3b0d59368bb64366748b7184e1dde Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 05:41:44 +0000 Subject: [PATCH 285/293] wip Signed-off-by: Andrew Feldman --- vllm/v1/engine/logprobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 56e028d29c88d..dab8f527cb7ff 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -52,8 +52,8 @@ def from_new_request( output_kind=request.sampling_params.output_kind, prompt_token_ids=request.prompt_token_ids, cumulative_logprob=(0. 
if num_logprobs else None), - logprobs=([] if num_logprobs else None), - prompt_logprobs=([] if num_prompt_logprobs else None), + logprobs=(None if num_logprobs is None else []), + prompt_logprobs=(None if num_prompt_logprobs is None else []), num_prompt_logprobs=(num_prompt_logprobs or 0), num_logprobs=(num_logprobs or 0), ) From bf20f4b23b08f6ad99c977c83b91846d8727e8f7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 05:51:27 +0000 Subject: [PATCH 286/293] passing with no sample/prompt logprobs Signed-off-by: Andrew Feldman --- vllm/v1/engine/logprobs.py | 10 ++++++++-- vllm/v1/engine/output_processor.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index dab8f527cb7ff..ea5de25d1cea1 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -52,8 +52,8 @@ def from_new_request( output_kind=request.sampling_params.output_kind, prompt_token_ids=request.prompt_token_ids, cumulative_logprob=(0. if num_logprobs else None), - logprobs=(None if num_logprobs is None else []), - prompt_logprobs=(None if num_prompt_logprobs is None else []), + logprobs=([] if num_logprobs else None), + prompt_logprobs=([] if num_prompt_logprobs else None), num_prompt_logprobs=(num_prompt_logprobs or 0), num_logprobs=(num_logprobs or 0), ) @@ -79,6 +79,9 @@ def _update_sample_logprobs( Return: Sample logprobs, if required for this request """ + if self.num_logprobs == 0: + # Sample logprobs disabled for this request + return None assert self.logprobs is not None for sampled_token_id, logprobs, token_ids in zip( @@ -149,6 +152,9 @@ def _update_prompt_logprobs( Return: Prompt logprobs, if required for this request """ + if self.num_prompt_logprobs == 0: + # Prompt logprobs disabled for this request + return None assert prompt_logprobs is not None assert token_ids is not None if prompt_logprobs.numel() == 0: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7c4a4dc4965eb..7036c1e8225a9 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -163,11 +163,11 @@ def _make_request_output( request_state.request_id, request_state.prompt, request_state.prompt_token_ids, + detokenizer_output.output_text, + detokenizer_output.token_ids, logprobs_output.logprobs, logprobs_output.prompt_logprobs, logprobs_output.cumulative_logprob, - detokenizer_output.output_text, - detokenizer_output.token_ids, detokenizer_output.finished, ) if detokenizer_output.finished: From 4fae2004e797074d7d2d61fe3d49599e8823adb6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 05:56:00 +0000 Subject: [PATCH 287/293] fix to get prompt logprobs tests passing (sample logprobs tests already passing) Signed-off-by: Andrew Feldman --- vllm/v1/engine/logprobs.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index ea5de25d1cea1..c8c3abcc49707 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -291,18 +291,16 @@ def update_from_output( new_prompt_logprobs = output.new_prompt_logprobs # 1) Make Sample Logprobs, if requested - logprobs = (None if self.num_logprobs == 0 else - self._update_sample_logprobs( - new_token_ids, - new_logprobs_token_ids, - new_logprobs, - )) + logprobs = self._update_sample_logprobs( + new_token_ids, + new_logprobs_token_ids, + new_logprobs, + ) # 4) Make Prompt Logprobs. 
- prompt_logprobs = (None if self.num_prompt_logprobs else - self._update_prompt_logprobs( - new_prompt_logprobs_token_ids, - new_prompt_logprobs, self.prompt_token_ids)) + prompt_logprobs = self._update_prompt_logprobs( + new_prompt_logprobs_token_ids, new_prompt_logprobs, + self.prompt_token_ids) # 5) Makes the LogprobsOutput object with the new text. finished = bool(output.finish_reason) From 9deca70f6bbe1be0ebc66ba38b436de03d49f72f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 06:03:22 +0000 Subject: [PATCH 288/293] sample and prompt logprobs optional in EngineCoreOutput; makes detokenizer tests pass Signed-off-by: Andrew Feldman --- vllm/v1/engine/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 8d46304239384..98ff7e9e6f3f2 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -43,11 +43,11 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] - new_logprobs: List[torch.Tensor] - new_logprobs_token_ids: List[torch.Tensor] - new_prompt_logprobs: Optional[torch.Tensor] - new_prompt_logprobs_token_ids: Optional[torch.Tensor] finished: bool + new_logprobs: List[torch.Tensor] = [] + new_logprobs_token_ids: List[torch.Tensor] = [] + new_prompt_logprobs: Optional[torch.Tensor] = None + new_prompt_logprobs_token_ids: Optional[torch.Tensor] = None finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None From 46e65ae68425d6ed53d7d4e953a7e4412b367a2a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 06:14:12 +0000 Subject: [PATCH 289/293] wip Signed-off-by: Andrew Feldman --- tests/v1/engine/temp.py | 372 +++++++++++++++++++++++++++++ vllm/v1/engine/output_processor.py | 16 ++ 2 files changed, 388 insertions(+) create mode 100644 tests/v1/engine/temp.py diff --git a/tests/v1/engine/temp.py b/tests/v1/engine/temp.py new file mode 100644 index 0000000000000..71251442382db --- /dev/null +++ b/tests/v1/engine/temp.py @@ -0,0 +1,372 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import pytest +import torch +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) + +from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, + generate_dummy_sample_logprobs, + validate_requests_logprobs) +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer + +# Number of sample logprobs to request when testing sample logprobs +NUM_SAMPLE_LOGPROBS = 5 +# Number of prompt logprobs to request when testing prompt logprobs +NUM_PROMPT_LOGPROBS = 7 +# Use Mistral instruct tokenizer +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] +STOP_STRINGS = ["I love working on", "company by far", "brother in"] +PROMPT_LEN = 5 + + +@dataclass +class DummyTestVectors: + """Dummy test vectors for detokenizer tests""" + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + full_tokens: List[List[int]] # Prompt + generated tokens + prompt_tokens: List[List[int]] + generation_tokens: List[List[int]] + # Each request is associated with a tuple of (top logprobs,top 
tokens) + # prompt logprobs tensors + prompt_logprobs: List[Tuple[torch.Tensor, torch.Tensor]] + # Each request is associated with a sample logprobs; a request's + # sample logprobs are a list of (top logprobs,top tokens) + # sample logprobs tensors at each sequence position + generation_logprobs: List[List[Tuple[torch.Tensor, torch.Tensor]]] + prompt_strings: List[str] + prompt_strings_len: List[int] + generation_strings: List[str] + + +@pytest.fixture(scope="module") +def dummy_test_vectors() -> DummyTestVectors: + """Generate dummy test vectors for detokenizer tests. + + Returns: + DummyTestVectors instance + """ + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + # Tokenize prompts under test & create dummy generated tokens + prompt_tokens = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS + ] + generation_tokens = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS + ] + # Generate prompt strings + prompt_strings = [ + tokenizer.decode(prompt_tokens, + skip_special_tokens=True, + tokenizer=tokenizer) + for prompt_tokens in prompt_tokens + ] + prompt_strings_len = [ + len(prompt_string) for prompt_string in prompt_strings + ] + return DummyTestVectors( + tokenizer=tokenizer, + full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], + prompt_tokens=prompt_tokens, + generation_tokens=generation_tokens, + prompt_strings=prompt_strings, + prompt_strings_len=prompt_strings_len, + generation_strings=[ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) + ], + prompt_logprobs=[ + generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS, + tokenizer=tokenizer) + for tokens_list in prompt_tokens + ], + generation_logprobs=[ + generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS, + tokenizer=tokenizer) + for tokens_list in generation_tokens + ]) + + +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__( + self, + generated_tokens_list: List[List[int]], + prompt_tokens_list: List[List[int]], + generated_logprobs_raw: Optional[List[List[Tuple[torch.Tensor, + torch.Tensor]]]], + prompt_logprobs_raw: Optional[List[Tuple[torch.Tensor, torch.Tensor]]], + ) -> None: + self.generated_tokens_list = generated_tokens_list + self.prompt_tokens_list = prompt_tokens_list + self.current_idx = 0 + self.generated_logprobs_raw = generated_logprobs_raw + self.do_logprobs = generated_logprobs_raw is not None + self.prompt_logprobs_raw = prompt_logprobs_raw + self.do_prompt_logprobs = prompt_logprobs_raw is not None + + def get_outputs(self) -> List[EngineCoreOutput]: + do_logprobs = self.do_logprobs + do_prompt_logprobs = self.do_prompt_logprobs + token_idx = self.current_idx + + outputs = [] + for req_idx, generated_token_ids in enumerate( + self.generated_tokens_list): + if len(generated_token_ids) > token_idx: + if do_logprobs: + assert self.generated_logprobs_raw is not None + (logprobs, logprobs_token_ids) = ( + self.generated_logprobs_raw[req_idx][token_idx]) + logprobs = [logprobs] + logprobs_token_ids = [logprobs_token_ids] + else: + logprobs = None + logprobs_token_ids = None + if do_prompt_logprobs: + if self.current_idx == 0: + assert self.prompt_logprobs_raw is not None + prompt_logprobs = self.prompt_logprobs_raw[req_idx][0] + prompt_logprobs_token_ids = self.prompt_logprobs_raw[ + req_idx][1] + else: + (prompt_logprobs, + prompt_logprobs_token_ids) = (torch.empty(0, 0), + torch.empty(0, 0)) + 
else: + (prompt_logprobs, prompt_logprobs_token_ids) = (None, None) + output = EngineCoreOutput( + request_id=f"request-{req_idx}", + new_token_ids=[generated_token_ids[token_idx]], + finished=False, + logprobs=logprobs, + logprobs_token_ids=logprobs_token_ids, + prompt_logprobs=prompt_logprobs, + prompt_logprobs_token_ids=prompt_logprobs_token_ids, + ) + if token_idx == len(generated_token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + self.current_idx += 1 + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.parametrize("logprobs,prompt_logprobs", + [(None, None), (NUM_SAMPLE_LOGPROBS, None), + (None, NUM_PROMPT_LOGPROBS), + (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) +def test_incremental_detokenization( + request_output_kind: RequestOutputKind, + logprobs: Optional[int], + prompt_logprobs: Optional[int], + dummy_test_vectors: DummyTestVectors, +) -> None: + generation_tokens = dummy_test_vectors.generation_tokens + prompt_tokens = dummy_test_vectors.prompt_tokens + # Determine whether sample/prompt logprobs are enabled + do_generated_logprobs = logprobs is not None + do_prompt_logprobs = prompt_logprobs is not None + detokenizer = Detokenizer(TOKENIZER_NAME) + # Build mock engine core, which emulates sampling & logprobs + engine_core = MockEngineCore( + generated_tokens_list=generation_tokens, + prompt_tokens_list=prompt_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if do_generated_logprobs else None, + prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs + if do_prompt_logprobs else None) + + # Make N requests. + requests = [ + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs)) + for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, prompt_tokens)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + assert len(requests_to_abort) == 0 + + # Validate logprob detokenization + validate_requests_logprobs(requests, request_outputs, + dummy_test_vectors.tokenizer) + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. 
+ for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(dummy_test_vectors.generation_strings, generation_tokens)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +@pytest.mark.parametrize("logprobs,prompt_logprobs", + [(None, None), (NUM_SAMPLE_LOGPROBS, None), + (None, NUM_PROMPT_LOGPROBS), + (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) +def test_stop_string( + include_stop_str_in_output: bool, + logprobs: Optional[int], + prompt_logprobs: Optional[int], + dummy_test_vectors: DummyTestVectors, +) -> None: + prompt_tokens = dummy_test_vectors.prompt_tokens + do_generated_logprobs = logprobs is not None + do_prompt_logprobs = prompt_logprobs is not None + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore( + generated_tokens_list=dummy_test_vectors.generation_tokens, + prompt_tokens_list=prompt_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if do_generated_logprobs else None, + prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs + if do_prompt_logprobs else None) + + # Make N requests. + requests = [ + EngineCoreRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + )) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, prompt_tokens)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + aborted = [] + i = 0 + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Validate logprob detokenization + validate_requests_logprobs(requests, request_outputs, + dummy_test_vectors.tokenizer) + + # Update tracking. + for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + if request_id not in gen_strings: + gen_strings[request_id] = new_text + else: + gen_strings[request_id] += new_text + i += 1 + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, stop_str) in enumerate( + zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. 
+ stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7036c1e8225a9..32c4f4ab3bad5 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -11,6 +11,22 @@ from vllm.v1.metrics.stats import IterationStats + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import pytest +import torch +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) + +from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, + generate_dummy_sample_logprobs, + validate_requests_logprobs) +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer + @dataclass class OutputProcessorOutput: From 65b9b64f557faea83c1d6573ca847a38800fac5c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 07:03:52 +0000 Subject: [PATCH 290/293] refactored output processor test vectors into utils and test fixtures Signed-off-by: Andrew Feldman --- tests/v1/engine/conftest.py | 112 +++++++ tests/v1/engine/temp.py | 372 ----------------------- tests/v1/engine/test_output_processor.py | 124 +++----- tests/v1/engine/utils.py | 71 ++++- vllm/v1/engine/output_processor.py | 16 - 5 files changed, 214 insertions(+), 481 deletions(-) create mode 100644 tests/v1/engine/conftest.py delete mode 100644 tests/v1/engine/temp.py diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py new file mode 100644 index 0000000000000..9af32e055cc84 --- /dev/null +++ b/tests/v1/engine/conftest.py @@ -0,0 +1,112 @@ +"""Engine test fixtures""" +import pytest +from transformers import AutoTokenizer + +from vllm.engine.arg_utils import EngineArgs +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs + +from tests.v1.engine.utils import ( + DummyOutputProcessorTestVectors, + generate_dummy_sample_logprobs, + generate_dummy_prompt_logprobs, + TOKENIZER_NAME, + FULL_STRINGS, + PROMPT_LEN, + NUM_SAMPLE_LOGPROBS, + NUM_PROMPT_LOGPROBS, +) + +@pytest.fixture +def dummy_test_vectors() -> DummyOutputProcessorTestVectors: + """Generate dummy test vectors for detokenizer tests. 
+ + Returns: + DummyTestVectors instance + """ + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config() + # Tokenize prompts under test & create dummy generated tokens + prompt_tokens = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS + ] + generation_tokens = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS + ] + # Generate prompt strings + prompt_strings = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in prompt_tokens + ] + prompt_strings_len = [ + len(prompt_string) for prompt_string in prompt_strings + ] + return DummyOutputProcessorTestVectors( + tokenizer=tokenizer, + tokenizer_group=init_tokenizer_from_configs( + vllm_config.model_config, vllm_config.scheduler_config, + vllm_config.parallel_config, vllm_config.lora_config), + vllm_config=vllm_config, + full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], + prompt_tokens=prompt_tokens, + generation_tokens=generation_tokens, + prompt_strings=prompt_strings, + prompt_strings_len=prompt_strings_len, + generation_strings=[ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) + ], + prompt_logprobs=[], + generation_logprobs=[]) + + +@pytest.fixture +def dummy_test_vectors_with_logprobs() -> DummyOutputProcessorTestVectors: + """Generate dummy test vectors for detokenizer tests. + + Returns: + DummyTestVectors instance + """ + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config() + # Tokenize prompts under test & create dummy generated tokens + prompt_tokens = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS + ] + generation_tokens = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS + ] + # Generate prompt strings + prompt_strings = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in prompt_tokens + ] + prompt_strings_len = [ + len(prompt_string) for prompt_string in prompt_strings + ] + return DummyOutputProcessorTestVectors( + tokenizer=tokenizer, + tokenizer_group=init_tokenizer_from_configs( + vllm_config.model_config, vllm_config.scheduler_config, + vllm_config.parallel_config, vllm_config.lora_config), + vllm_config=vllm_config, + full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], + prompt_tokens=prompt_tokens, + generation_tokens=generation_tokens, + prompt_strings=prompt_strings, + prompt_strings_len=prompt_strings_len, + generation_strings=[ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) + ], + prompt_logprobs=[ + generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS, + tokenizer=tokenizer) + for tokens_list in prompt_tokens + ], + generation_logprobs=[ + generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS, + tokenizer=tokenizer) + for tokens_list in generation_tokens + ]) \ No newline at end of file diff --git a/tests/v1/engine/temp.py b/tests/v1/engine/temp.py deleted file mode 100644 index 71251442382db..0000000000000 --- a/tests/v1/engine/temp.py +++ /dev/null @@ -1,372 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import pytest -import torch -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) - -from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, - 
generate_dummy_sample_logprobs, - validate_requests_logprobs) -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer - -# Number of sample logprobs to request when testing sample logprobs -NUM_SAMPLE_LOGPROBS = 5 -# Number of prompt logprobs to request when testing prompt logprobs -NUM_PROMPT_LOGPROBS = 7 -# Use Mistral instruct tokenizer -TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" - -FULL_STRINGS = [ - "My name is Robert from Neural Magic and I love working on vLLM so much!", - "Red Hat is the best open source company by far across Linux, K8s, and AI.", - "Nick is the name of my brother in addition to my colleague from Red Hat.", -] -STOP_STRINGS = ["I love working on", "company by far", "brother in"] -PROMPT_LEN = 5 - - -@dataclass -class DummyTestVectors: - """Dummy test vectors for detokenizer tests""" - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] - full_tokens: List[List[int]] # Prompt + generated tokens - prompt_tokens: List[List[int]] - generation_tokens: List[List[int]] - # Each request is associated with a tuple of (top logprobs,top tokens) - # prompt logprobs tensors - prompt_logprobs: List[Tuple[torch.Tensor, torch.Tensor]] - # Each request is associated with a sample logprobs; a request's - # sample logprobs are a list of (top logprobs,top tokens) - # sample logprobs tensors at each sequence position - generation_logprobs: List[List[Tuple[torch.Tensor, torch.Tensor]]] - prompt_strings: List[str] - prompt_strings_len: List[int] - generation_strings: List[str] - - -@pytest.fixture(scope="module") -def dummy_test_vectors() -> DummyTestVectors: - """Generate dummy test vectors for detokenizer tests. 
- - Returns: - DummyTestVectors instance - """ - tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - # Tokenize prompts under test & create dummy generated tokens - prompt_tokens = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS - ] - generation_tokens = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS - ] - # Generate prompt strings - prompt_strings = [ - tokenizer.decode(prompt_tokens, - skip_special_tokens=True, - tokenizer=tokenizer) - for prompt_tokens in prompt_tokens - ] - prompt_strings_len = [ - len(prompt_string) for prompt_string in prompt_strings - ] - return DummyTestVectors( - tokenizer=tokenizer, - full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], - prompt_tokens=prompt_tokens, - generation_tokens=generation_tokens, - prompt_strings=prompt_strings, - prompt_strings_len=prompt_strings_len, - generation_strings=[ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) - ], - prompt_logprobs=[ - generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, - num_logprobs=NUM_PROMPT_LOGPROBS, - tokenizer=tokenizer) - for tokens_list in prompt_tokens - ], - generation_logprobs=[ - generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, - num_logprobs=NUM_SAMPLE_LOGPROBS, - tokenizer=tokenizer) - for tokens_list in generation_tokens - ]) - - -class MockEngineCore: - """Mock outputs form premade tokens lists.""" - - def __init__( - self, - generated_tokens_list: List[List[int]], - prompt_tokens_list: List[List[int]], - generated_logprobs_raw: Optional[List[List[Tuple[torch.Tensor, - torch.Tensor]]]], - prompt_logprobs_raw: Optional[List[Tuple[torch.Tensor, torch.Tensor]]], - ) -> None: - self.generated_tokens_list = generated_tokens_list - self.prompt_tokens_list = prompt_tokens_list - self.current_idx = 0 - self.generated_logprobs_raw = generated_logprobs_raw - self.do_logprobs = generated_logprobs_raw is not None - self.prompt_logprobs_raw = prompt_logprobs_raw - self.do_prompt_logprobs = prompt_logprobs_raw is not None - - def get_outputs(self) -> List[EngineCoreOutput]: - do_logprobs = self.do_logprobs - do_prompt_logprobs = self.do_prompt_logprobs - token_idx = self.current_idx - - outputs = [] - for req_idx, generated_token_ids in enumerate( - self.generated_tokens_list): - if len(generated_token_ids) > token_idx: - if do_logprobs: - assert self.generated_logprobs_raw is not None - (logprobs, logprobs_token_ids) = ( - self.generated_logprobs_raw[req_idx][token_idx]) - logprobs = [logprobs] - logprobs_token_ids = [logprobs_token_ids] - else: - logprobs = None - logprobs_token_ids = None - if do_prompt_logprobs: - if self.current_idx == 0: - assert self.prompt_logprobs_raw is not None - prompt_logprobs = self.prompt_logprobs_raw[req_idx][0] - prompt_logprobs_token_ids = self.prompt_logprobs_raw[ - req_idx][1] - else: - (prompt_logprobs, - prompt_logprobs_token_ids) = (torch.empty(0, 0), - torch.empty(0, 0)) - else: - (prompt_logprobs, prompt_logprobs_token_ids) = (None, None) - output = EngineCoreOutput( - request_id=f"request-{req_idx}", - new_token_ids=[generated_token_ids[token_idx]], - finished=False, - logprobs=logprobs, - logprobs_token_ids=logprobs_token_ids, - prompt_logprobs=prompt_logprobs, - prompt_logprobs_token_ids=prompt_logprobs_token_ids, - ) - if token_idx == len(generated_token_ids) - 1: - output.finished = True - output.finish_reason = "stopped" - outputs.append(output) - - self.current_idx += 1 - return outputs - - -@pytest.mark.parametrize( - 
"request_output_kind", - [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -@pytest.mark.parametrize("logprobs,prompt_logprobs", - [(None, None), (NUM_SAMPLE_LOGPROBS, None), - (None, NUM_PROMPT_LOGPROBS), - (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) -def test_incremental_detokenization( - request_output_kind: RequestOutputKind, - logprobs: Optional[int], - prompt_logprobs: Optional[int], - dummy_test_vectors: DummyTestVectors, -) -> None: - generation_tokens = dummy_test_vectors.generation_tokens - prompt_tokens = dummy_test_vectors.prompt_tokens - # Determine whether sample/prompt logprobs are enabled - do_generated_logprobs = logprobs is not None - do_prompt_logprobs = prompt_logprobs is not None - detokenizer = Detokenizer(TOKENIZER_NAME) - # Build mock engine core, which emulates sampling & logprobs - engine_core = MockEngineCore( - generated_tokens_list=generation_tokens, - prompt_tokens_list=prompt_tokens, - generated_logprobs_raw=dummy_test_vectors.generation_logprobs - if do_generated_logprobs else None, - prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs - if do_prompt_logprobs else None) - - # Make N requests. - requests = [ - EngineCoreRequest(request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs)) - for idx, (prompt, prompt_tokens) in enumerate( - zip(dummy_test_vectors.prompt_strings, prompt_tokens)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - gen_tokens = {} - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - assert len(requests_to_abort) == 0 - - # Validate logprob detokenization - validate_requests_logprobs(requests, request_outputs, - dummy_test_vectors.tokenizer) - - # Update tracking. - for request_output in request_outputs: - request_id = request_output.request_id - new_text = request_output.outputs[0].text - new_tokens = request_output.outputs[0].token_ids - if request_id not in gen_strings: - gen_strings[request_id] = new_text - gen_tokens[request_id] = new_tokens - else: - gen_strings[request_id] += new_text - gen_tokens[request_id].extend(new_tokens) - - # Confirmed tracked values matches what we expected. 
- for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(dummy_test_vectors.generation_strings, generation_tokens)): - gen_str = gen_strings[f"request-{idx}"] - gen_toks = gen_tokens[f"request-{idx}"] - - assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" - assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() - - -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.parametrize("logprobs,prompt_logprobs", - [(None, None), (NUM_SAMPLE_LOGPROBS, None), - (None, NUM_PROMPT_LOGPROBS), - (NUM_SAMPLE_LOGPROBS, NUM_PROMPT_LOGPROBS)]) -def test_stop_string( - include_stop_str_in_output: bool, - logprobs: Optional[int], - prompt_logprobs: Optional[int], - dummy_test_vectors: DummyTestVectors, -) -> None: - prompt_tokens = dummy_test_vectors.prompt_tokens - do_generated_logprobs = logprobs is not None - do_prompt_logprobs = prompt_logprobs is not None - detokenizer = Detokenizer(TOKENIZER_NAME) - engine_core = MockEngineCore( - generated_tokens_list=dummy_test_vectors.generation_tokens, - prompt_tokens_list=prompt_tokens, - generated_logprobs_raw=dummy_test_vectors.generation_logprobs - if do_generated_logprobs else None, - prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs - if do_prompt_logprobs else None) - - # Make N requests. - requests = [ - EngineCoreRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - arrival_time=0, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - eos_token_id=None, - lora_request=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, - )) for idx, (prompt, prompt_tokens) in enumerate( - zip(dummy_test_vectors.prompt_strings, prompt_tokens)) - ] - - # Add requests to the detokenizer. - for request in requests: - detokenizer.add_request(request) - - gen_strings = {} - aborted = [] - i = 0 - while True: - # Mock output from the EngineCore. - outputs = engine_core.get_outputs() - if len(outputs) == 0: - break - - # Step the Detokenizer. - request_outputs, requests_to_abort = detokenizer.step(outputs) - for request_output in request_outputs: - # If aborted, we should not get a request output. - assert request_output.request_id not in aborted - aborted.extend(requests_to_abort) - - # Validate logprob detokenization - validate_requests_logprobs(requests, request_outputs, - dummy_test_vectors.tokenizer) - - # Update tracking. - for request_output in request_outputs: - if request_output.finished: - assert request_output.outputs[0].finish_reason == "stop" - - request_id = request_output.request_id - new_text = request_output.outputs[0].text - if request_id not in gen_strings: - gen_strings[request_id] = new_text - else: - gen_strings[request_id] += new_text - i += 1 - - # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, stop_str) in enumerate( - zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): - - # Request should be aborted. - request_id = f"request-{idx}" - assert request_id in aborted - - # Collected values that were generated. - gen_str = gen_strings[request_id] - - # Construct reference strings. 
- stop_str_idx = ref_gen_str.find(stop_str) - ref_str_exc_stop = ref_gen_str[:stop_str_idx] - ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str - - if include_stop_str_in_output: - assert gen_str == ref_str_inc_stop, ( - f"{gen_str=}, {ref_str_inc_stop=}") - else: - assert gen_str == ref_str_exc_stop, ( - f"{gen_str=}, {ref_str_exc_stop=}") - - assert detokenizer.get_num_unfinished_requests() == 0 - assert not detokenizer.has_unfinished_requests() diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 4735c6f947537..ee33e086512d7 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,80 +1,19 @@ -from typing import List - import pytest -from transformers import AutoTokenizer -from vllm.engine.arg_utils import EngineArgs +from tests.v1.engine.utils import STOP_STRINGS, MockEngineCore from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor -TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" -VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() -TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config, - VLLM_CONFIG.scheduler_config, - VLLM_CONFIG.parallel_config, - VLLM_CONFIG.lora_config) -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - -FULL_STRINGS = [ - "My name is Robert from Neural Magic and I love working on vLLM so much!", - "Red Hat is the best open source company by far across Linux, K8s, and AI.", - "Nick is the name of my brother in addition to my colleague from Red Hat.", -] - -STOP_STRINGS = ["I love working on", "company by far", "brother in"] - -FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] -PROMPT_LEN = 5 -PROMPT_TOKENS = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS -] -GENERATION_TOKENS = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS -] -PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, skip_special_tokens=True) - for prompt_tokens in PROMPT_TOKENS -] -PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] -GENERATION_STRINGS = [ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) -] - - -class MockEngineCore: - """Mock outputs form premade tokens lists.""" - - def __init__(self, tokens_list: List[List[int]]): - self.tokens_list = tokens_list - self.current_idx = 0 - - def get_outputs(self) -> List[EngineCoreOutput]: - token_idx = self.current_idx - self.current_idx += 1 - - outputs = [] - for req_idx, token_ids in enumerate(self.tokens_list): - if len(token_ids) > token_idx: - output = EngineCoreOutput(request_id=f"request-{req_idx}", - new_token_ids=[token_ids[token_idx]], - finished=False) - if token_idx == len(token_ids) - 1: - output.finished = True - output.finish_reason = "stopped" - outputs.append(output) - - return outputs - @pytest.mark.parametrize( "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) - engine_core = MockEngineCore(GENERATION_TOKENS) +def test_incremental_detokenization(request_output_kind: RequestOutputKind, + dummy_test_vectors): + 
output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=False) + engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) # Make N requests. requests = [ @@ -93,9 +32,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): output_kind=request_output_kind, stop=[], include_stop_str_in_output=False)) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) ] # Add requests to the detokenizer. @@ -130,7 +69,8 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): # Confirmed tracked values matches what we expected. for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(GENERATION_STRINGS, GENERATION_TOKENS)): + zip(dummy_test_vectors.generation_strings, + dummy_test_vectors.generation_tokens)): gen_str = gen_strings[f"request-{idx}"] gen_toks = gen_tokens[f"request-{idx}"] @@ -142,9 +82,10 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -def test_stop_string(include_stop_str_in_output: bool): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) - engine_core = MockEngineCore(GENERATION_TOKENS) +def test_stop_string(include_stop_str_in_output: bool, dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=False) + engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) # Make N requests. requests = [ @@ -164,9 +105,9 @@ def test_stop_string(include_stop_str_in_output: bool): output_kind=RequestOutputKind.DELTA, stop=STOP_STRINGS, include_stop_str_in_output=include_stop_str_in_output, - )) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + )) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) ] # Add requests to the detokenizer. @@ -203,8 +144,8 @@ def test_stop_string(include_stop_str_in_output: bool): gen_strings[request_id] += new_text # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, - stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + for idx, (ref_gen_str, stop_str) in enumerate( + zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): # Request should be aborted. request_id = f"request-{idx}" @@ -229,9 +170,10 @@ def test_stop_string(include_stop_str_in_output: bool): assert not output_processor.has_unfinished_requests() -def test_iteration_stats(): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) - engine_core = MockEngineCore(GENERATION_TOKENS) +def test_iteration_stats(dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=True) + engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) # Make N requests. requests = [ @@ -246,13 +188,13 @@ def test_iteration_stats(): eos_token_id=None, lora_request=None, sampling_params=SamplingParams(), - ) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) ] # Add all requests except one to the OutputProcessor. 
- num_active = len(GENERATION_TOKENS) - 1 + num_active = len(dummy_test_vectors.generation_tokens) - 1 for request in requests[:num_active]: output_processor.add_request(request) inactive_request = requests[num_active] @@ -261,8 +203,10 @@ def test_iteration_stats(): outputs = engine_core.get_outputs()[:num_active] processed_outputs = output_processor.process_outputs(outputs) iteration_stats = processed_outputs.iteration_stats - total_prompt_tokens = sum( - [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]]) + total_prompt_tokens = sum([ + len(prompt_tokens) + for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] + ]) assert iteration_stats.num_prompt_tokens == total_prompt_tokens assert iteration_stats.num_generation_tokens == num_active @@ -281,7 +225,7 @@ def test_iteration_stats(): outputs = engine_core.get_outputs()[:num_active] processed_outputs = output_processor.process_outputs(outputs) iteration_stats = processed_outputs.iteration_stats - total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1]) + total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) assert iteration_stats.num_prompt_tokens == total_prompt_tokens assert iteration_stats.num_generation_tokens == num_active diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index ff2ebe77f0911..e6b5806d36429 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -1,12 +1,31 @@ """Engine test utils""" import random -from typing import List, Tuple +from dataclasses import dataclass +from typing import List, Tuple, Union import torch -from transformers.tokenization_utils import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from vllm.engine.arg_utils import EngineArgs from vllm.outputs import RequestOutput -from vllm.v1.engine import EngineCoreRequest +from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest + +# Number of sample logprobs to request when testing sample logprobs +NUM_SAMPLE_LOGPROBS = 5 +# Number of prompt logprobs to request when testing prompt logprobs +NUM_PROMPT_LOGPROBS = 7 + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] +STOP_STRINGS = ["I love working on", "company by far", "brother in"] +PROMPT_LEN = 5 random.seed(42) @@ -270,3 +289,49 @@ def validate_requests_logprobs( assert plp.decoded_token == _decode_token( tok_id, tokenizer), "prompt logprob decoded token mismatch" + + +@dataclass +class DummyOutputProcessorTestVectors: + """Dummy test vectors for output processor tests""" + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + tokenizer_group: BaseTokenizerGroup + vllm_config: EngineArgs + full_tokens: List[List[int]] # Prompt + generated tokens + prompt_tokens: List[List[int]] + generation_tokens: List[List[int]] + # Each request is associated with a tuple of (top logprobs,top tokens) + # prompt logprobs tensors + prompt_logprobs: List[Tuple[torch.Tensor, torch.Tensor]] + # Each request is associated with a sample logprobs; a request's + # sample logprobs are a list of (top logprobs,top tokens) + # sample logprobs tensors at each sequence position + generation_logprobs: List[List[Tuple[torch.Tensor, 
torch.Tensor]]] + prompt_strings: List[str] + prompt_strings_len: List[int] + generation_strings: List[str] + + +class MockEngineCore: + """Mock engine core outputs form premade tokens lists.""" + + def __init__(self, tokens_list: List[List[int]]): + self.tokens_list = tokens_list + self.current_idx = 0 + + def get_outputs(self) -> List[EngineCoreOutput]: + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + output = EngineCoreOutput(request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + finished=False) + if token_idx == len(token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 32c4f4ab3bad5..7036c1e8225a9 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -11,22 +11,6 @@ from vllm.v1.metrics.stats import IterationStats - -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import pytest -import torch -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) - -from tests.v1.engine.utils import (generate_dummy_prompt_logprobs, - generate_dummy_sample_logprobs, - validate_requests_logprobs) -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import Detokenizer - @dataclass class OutputProcessorOutput: From 8dad9840210b104492f0a239fc3b10c7bb96f267 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 07:14:37 +0000 Subject: [PATCH 291/293] refactored test fixtures Signed-off-by: Andrew Feldman --- tests/v1/engine/conftest.py | 93 ++++++++++++++----------------------- 1 file changed, 36 insertions(+), 57 deletions(-) diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index 9af32e055cc84..1e8a92378bd25 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -2,27 +2,23 @@ import pytest from transformers import AutoTokenizer +from tests.v1.engine.utils import (FULL_STRINGS, NUM_PROMPT_LOGPROBS, + NUM_SAMPLE_LOGPROBS, PROMPT_LEN, + TOKENIZER_NAME, + DummyOutputProcessorTestVectors, + generate_dummy_prompt_logprobs, + generate_dummy_sample_logprobs) from vllm.engine.arg_utils import EngineArgs from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from tests.v1.engine.utils import ( - DummyOutputProcessorTestVectors, - generate_dummy_sample_logprobs, - generate_dummy_prompt_logprobs, - TOKENIZER_NAME, - FULL_STRINGS, - PROMPT_LEN, - NUM_SAMPLE_LOGPROBS, - NUM_PROMPT_LOGPROBS, -) -@pytest.fixture -def dummy_test_vectors() -> DummyOutputProcessorTestVectors: +def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: """Generate dummy test vectors for detokenizer tests. 
Returns: - DummyTestVectors instance + DummyOutputProcessorTestVectors instance with no logprobs """ + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config() # Tokenize prompts under test & create dummy generated tokens @@ -60,53 +56,36 @@ def dummy_test_vectors() -> DummyOutputProcessorTestVectors: @pytest.fixture -def dummy_test_vectors_with_logprobs() -> DummyOutputProcessorTestVectors: +def dummy_test_vectors() -> DummyOutputProcessorTestVectors: """Generate dummy test vectors for detokenizer tests. Returns: - DummyTestVectors instance + DummyOutputProcessorTestVectors instance with no + logprobs. """ - tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config() - # Tokenize prompts under test & create dummy generated tokens - prompt_tokens = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS - ] - generation_tokens = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS - ] - # Generate prompt strings - prompt_strings = [ - tokenizer.decode(prompt_tokens, skip_special_tokens=True) - for prompt_tokens in prompt_tokens + return _build_test_vectors_no_logprobs() + + +@pytest.fixture +def dummy_test_vectors_w_lp() -> DummyOutputProcessorTestVectors: + """Generate dummy test vectors for logprob processor tests. + + Returns: + DummyOutputProcessorTestVectors instance with logprobs + """ + dtv = _build_test_vectors_no_logprobs() + # Add sample and prompt logprobs to the dummy test vectors + # data structure. + dtv.generation_logprobs = [ + generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS, + tokenizer=dtv.tokenizer) + for tokens_list in dtv.generation_tokens ] - prompt_strings_len = [ - len(prompt_string) for prompt_string in prompt_strings + dtv.prompt_logprobs = [ + generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS, + tokenizer=dtv.tokenizer) + for tokens_list in dtv.prompt_tokens ] - return DummyOutputProcessorTestVectors( - tokenizer=tokenizer, - tokenizer_group=init_tokenizer_from_configs( - vllm_config.model_config, vllm_config.scheduler_config, - vllm_config.parallel_config, vllm_config.lora_config), - vllm_config=vllm_config, - full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], - prompt_tokens=prompt_tokens, - generation_tokens=generation_tokens, - prompt_strings=prompt_strings, - prompt_strings_len=prompt_strings_len, - generation_strings=[ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) - ], - prompt_logprobs=[ - generate_dummy_prompt_logprobs(prompt_tokens_list=tokens_list, - num_logprobs=NUM_PROMPT_LOGPROBS, - tokenizer=tokenizer) - for tokens_list in prompt_tokens - ], - generation_logprobs=[ - generate_dummy_sample_logprobs(sampled_tokens_list=tokens_list, - num_logprobs=NUM_SAMPLE_LOGPROBS, - tokenizer=tokenizer) - for tokens_list in generation_tokens - ]) \ No newline at end of file + return dtv From 29f491f8b0717f58491b7781e2b1c8801400a9ad Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 07:47:14 +0000 Subject: [PATCH 292/293] format Signed-off-by: Andrew Feldman --- tests/v1/sample/test_logprobs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 35087b585b40f..affa5b39c40ad 100644 --- a/tests/v1/sample/test_logprobs.py +++ 
b/tests/v1/sample/test_logprobs.py @@ -22,6 +22,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( detokenize: bool, batch_logprobs_composition: str, max_num_batched_tokens: int, + enable_prefix_caching: bool, example_prompts, monkeypatch, ) -> None: @@ -76,6 +77,7 @@ def _test_case_get_logprobs_and_prompt_logprobs( max_num_seqs=max_num_seqs, max_model_len=max_model_len, enforce_eager=True, + enable_prefix_caching=enable_prefix_caching, ) as vllm_model: vllm_results = vllm_model.model.generate( test_prompts, sampling_params=vllm_sampling_params) @@ -243,6 +245,7 @@ def test_get_logprobs_and_prompt_logprobs( detokenize=True, batch_logprobs_composition=batch_logprobs_composition, max_num_batched_tokens=max_num_batched_tokens, + enable_prefix_caching=False, example_prompts=example_prompts, monkeypatch=monkeypatch) @@ -257,7 +260,7 @@ def test_max_logprobs(monkeypatch): """ override_backend_env_variable(monkeypatch, "FLASH_ATTN") - runner = VllmRunner("facebook/opt-125m", max_logprobs=1) + runner = VllmRunner("facebook/opt-125m", max_logprobs=1,enable_prefix_caching=False) vllm_sampling_params = SamplingParams(logprobs=1) # should pass runner.generate(["Hello world"], sampling_params=vllm_sampling_params) @@ -288,6 +291,7 @@ def test_none_logprobs(vllm_runner, model, example_prompts, monkeypatch): model, max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, + enable_prefix_caching=False, ) as vllm_model: sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, logprobs=None, From 3302eae8c25e3532211691731ae0c79899a261e3 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 Jan 2025 07:53:23 +0000 Subject: [PATCH 293/293] format Signed-off-by: Andrew Feldman --- tests/v1/sample/test_logprobs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index affa5b39c40ad..57bd5284c180d 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -260,7 +260,9 @@ def test_max_logprobs(monkeypatch): """ override_backend_env_variable(monkeypatch, "FLASH_ATTN") - runner = VllmRunner("facebook/opt-125m", max_logprobs=1,enable_prefix_caching=False) + runner = VllmRunner("facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False) vllm_sampling_params = SamplingParams(logprobs=1) # should pass runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
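Note on the EngineCoreOutput change in PATCH 288/293: giving the four logprobs fields defaults is what lets the simplified MockEngineCore in tests/v1/engine/utils.py build outputs without mentioning logprobs at all. A minimal construction sketch, assuming keyword construction as that mock uses it; the tensor values below are placeholders and are not part of the patches:

import torch

from vllm.v1.engine import EngineCoreOutput

# A request that asked for no logprobs: only the required fields are passed;
# new_logprobs / new_logprobs_token_ids fall back to [] and the two
# prompt-logprobs fields fall back to None.
plain_output = EngineCoreOutput(
    request_id="request-0",
    new_token_ids=[42],
    finished=False,
)

# A request that did ask for logprobs passes the tensors explicitly.
# Shapes here are illustrative only.
lp = torch.zeros(5)
lp_ids = torch.zeros(5, dtype=torch.long)
logprobs_output = EngineCoreOutput(
    request_id="request-1",
    new_token_ids=[7],
    finished=False,
    new_logprobs=[lp],
    new_logprobs_token_ids=[lp_ids],
    new_prompt_logprobs=None,
    new_prompt_logprobs_token_ids=None,
)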
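The "(top logprobs, top tokens)" tensor convention referenced throughout DummyOutputProcessorTestVectors can be pictured with a small stand-alone helper. This is an illustrative stand-in only: the real generate_dummy_sample_logprobs / generate_dummy_prompt_logprobs helpers in tests/v1/engine/utils.py take a tokenizer and are not reproduced in these diffs, and the vocabulary size and shapes below are assumptions.

import torch

VOCAB_SIZE = 32000  # illustrative; the real helpers derive this from the tokenizer
NUM_LOGPROBS = 5


def _dummy_topk_logprobs(num_positions: int,
                         num_logprobs: int) -> tuple[torch.Tensor, torch.Tensor]:
    # Random logits -> log-softmax -> top-k, yielding one
    # (top logprobs, top token ids) pair of shape
    # (num_positions, num_logprobs).
    logits = torch.randn(num_positions, VOCAB_SIZE)
    logprobs = torch.log_softmax(logits, dim=-1)
    topk = torch.topk(logprobs, num_logprobs, dim=-1)
    return topk.values, topk.indices


# Prompt logprobs: one tensor pair covering every prompt position at once.
prompt_logprobs, prompt_logprob_token_ids = _dummy_topk_logprobs(7, NUM_LOGPROBS)

# Sample logprobs: a list with one (logprobs, token ids) pair per sampled
# position, matching the per-position list described in the dataclass.
sample_logprobs = [_dummy_topk_logprobs(1, NUM_LOGPROBS) for _ in range(4)]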
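End to end, the behavior the test suite exercises is driven from SamplingParams exactly as in the tests: logprobs requests sample logprobs for generated tokens and prompt_logprobs requests logprobs over the prompt. A short usage sketch against the public LLM API follows; the model name and flags mirror the tests, while the output fields prompt_logprobs and outputs[0].logprobs come from the existing vllm.outputs types rather than from these patches.

from vllm import LLM, SamplingParams

# Ask for 3 sample logprobs per generated token and 5 prompt logprobs per
# prompt token; the counts are arbitrary illustrations.
sampling_params = SamplingParams(max_tokens=16, logprobs=3, prompt_logprobs=5)

llm = LLM(model="facebook/opt-125m", enforce_eager=True)
outputs = llm.generate(["Hello, my name is"], sampling_params)

for request_output in outputs:
    # One entry per prompt token (the first entry has no logprob).
    print(request_output.prompt_logprobs)
    # One {token_id: Logprob} dict per generated token.
    print(request_output.outputs[0].logprobs)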