[Misc] Enable V1 LoRA by default #15320

Merged (5 commits, Apr 1, 2025)
60 changes: 57 additions & 3 deletions tests/entrypoints/openai/test_chat.py
@@ -24,7 +24,23 @@


@pytest.fixture(scope="module")
def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()


@pytest.fixture(scope="module", params=[False, True])
def server(
request,
monkeypatch_module,
zephyr_lora_files, #noqa: F811
zephyr_lora_added_tokens_files): # noqa: F811

use_v1 = request.param
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
@@ -49,6 +65,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
yield remote_server


@pytest.fixture
def is_v1_server(server):
import os
assert os.environ['VLLM_USE_V1'] in ['0', '1']
return os.environ['VLLM_USE_V1'] == '1'


@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
@@ -471,8 +494,13 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
is_v1_server: bool,
guided_decoding_backend: str,
sample_guided_choice):

if is_v1_server and guided_decoding_backend != 'xgrammar':
pytest.skip("Only xgrammar backend is supported with V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -511,9 +539,13 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,

@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_json_chat(client: openai.AsyncOpenAI,
async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
guided_decoding_backend: str,
sample_json_schema):

if is_v1_server:
pytest.skip("sample_json_schema has features unsupported in V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -559,7 +591,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
is_v1_server: bool,
guided_decoding_backend: str, sample_regex):

if is_v1_server and guided_decoding_backend != 'xgrammar':
pytest.skip("Only xgrammar backend is supported with V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -617,8 +654,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
is_v1_server: bool,
guided_decoding_backend: str,
sample_guided_choice):

if is_v1_server and guided_decoding_backend != 'xgrammar':
pytest.skip("Only xgrammar backend is supported with V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -648,9 +690,13 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,

@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_named_tool_use(client: openai.AsyncOpenAI,
async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
guided_decoding_backend: str,
sample_json_schema):

if is_v1_server:
pytest.skip("sample_json_schema has features unsupported on V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -742,6 +788,10 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
                                                    is_v1_server: bool,
                                                    sample_json_schema):

if is_v1_server:
pytest.skip("sample_json_schema has features unsupported on V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -787,6 +837,10 @@ async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
                                                   is_v1_server: bool,
                                                   sample_json_schema):

if is_v1_server:
pytest.skip("sample_json_schema has features unsupported on V1")

messages = [{
"role": "system",
"content": "you are a helpful assistant"
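Background on the pattern used in test_chat.py above: pytest's built-in monkeypatch fixture is function-scoped, so the module-scoped server fixture cannot depend on it directly; the hand-rolled monkeypatch_module fixture fills that gap, and params=[False, True] on server runs the whole module once per engine. A minimal, self-contained sketch of the same idea (the engine_env and test names here are illustrative, not part of the diff):

import os

import pytest
from _pytest.monkeypatch import MonkeyPatch


@pytest.fixture(scope="module")
def monkeypatch_module():
    # Module-scoped replacement for the function-scoped builtin monkeypatch.
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()  # restore every patched env var / attribute at module teardown


@pytest.fixture(scope="module", params=[False, True], ids=["v0", "v1"])
def engine_env(request, monkeypatch_module):
    # Run everything in the module twice: once with VLLM_USE_V1=0, once with 1.
    monkeypatch_module.setenv("VLLM_USE_V1", "1" if request.param else "0")
    return request.param


def test_engine_flag_is_set(engine_env):
    assert os.environ["VLLM_USE_V1"] == ("1" if engine_env else "0")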
16 changes: 8 additions & 8 deletions tests/lora/test_baichuan.py
@@ -11,6 +11,14 @@
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501

Contributor Author: Cosmetic change; moving the run_with_both_engines_lora fixture above the utility functions.


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -40,14 +48,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def test_baichuan_lora(baichuan_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
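The v1 autouse fixture added at the top of this file (and of the other LoRA test files below) only pulls in run_with_both_engines_lora, which is defined in the LoRA conftest and is not shown in this diff. Its exact implementation is not visible here; one plausible shape, offered purely as a hedged sketch (the ids, the skip_v1 marker handling, and the env-var toggling are assumptions):

import pytest


@pytest.fixture(params=[False, True], ids=["v0-engine", "v1-engine"])
def run_with_both_engines_lora(request, monkeypatch):
    # Hypothetical approximation: parametrize each test over both engines
    # by toggling VLLM_USE_V1, and honor the skip_v1 marker seen in these files.
    use_v1 = request.param
    if use_v1 and request.node.get_closest_marker("skip_v1"):
        pytest.skip("test marked skip_v1")
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield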
16 changes: 8 additions & 8 deletions tests/lora/test_chatglm3_tp.py
@@ -18,6 +18,14 @@
]

Contributor Author: Cosmetic change; moving the run_with_both_engines_lora fixture above the utility functions.


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@create_new_process_for_each_test()
def test_chatglm3_lora(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH,
16 changes: 8 additions & 8 deletions tests/lora/test_gemma.py
@@ -9,6 +9,14 @@
MODEL_PATH = "google/gemma-7b"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
"Quote: Imagination is",
@@ -31,14 +39,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


# The V1 lora test for this model requires more than 24GB.
@pytest.mark.skip_v1
Contributor Author: I am checking if the tests pass with the latest main 🤞

Contributor Author: The CI still OOMs, so I'm turning the test off for now.

@pytest.mark.xfail(current_platform.is_rocm(),
5 changes: 0 additions & 5 deletions tests/lora/test_layers.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import importlib
import random
from copy import deepcopy
from dataclasses import dataclass
@@ -82,10 +81,6 @@ def v1(run_with_both_engines_lora):
# This can be promoted up to conftest.py to run for every
# test in a package

# Reload punica_gpu as the kernels used are tied to engine type.
from vllm.lora.punica_wrapper import punica_gpu
importlib.reload(punica_gpu)

Contributor Author: This has been a no-op since #14685 landed, since both V0 and V1 use the same kernels.

# Release any memory we might be holding on to. CI runs OOMs otherwise.
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
_LORA_B_PTR_DICT)
20 changes: 8 additions & 12 deletions tests/lora/test_llama_tp.py
@@ -28,6 +28,14 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
@@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
print("removing lora")


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):

@@ -126,8 +124,6 @@ def get_num_gpu_blocks_no_lora():
"less when using lora than when not using lora")


# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
Contributor Author: I believe these numeric test failures are resolved. The skip was added in #13726, and the failure was due to the "can't find tokenizer file" error. Discussion on Slack: https://vllm-dev.slack.com/archives/C07R5PAL2L9/p1742010357759669

@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):
14 changes: 11 additions & 3 deletions tests/lora/test_lora_manager.py
@@ -7,7 +7,6 @@
from safetensors.torch import load_file
from torch import nn

from vllm import envs
from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
@@ -33,6 +32,17 @@
] if current_platform.is_cuda_alike() else ["cpu"])


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Some tests depend on V0 internals. Since both V0 and V1 use the same
LoRAModelManager it is okay to just test V0.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield


@pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file(
@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device


@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device)


@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
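The removed skipif(envs.VLLM_USE_V1, ...) guards in this file were evaluated once, at collection time, so they could not react to an engine flag toggled per test; the new autouse use_v0_only fixture instead pins VLLM_USE_V1=0 at setup time for every test in the file (its docstring above explains why testing V0 alone is sufficient here). A small illustration of that difference, using generic names and os.environ directly rather than vLLM's envs module:

import os

import pytest


# The decorator condition is evaluated when the module is collected, before
# any fixture has had a chance to change the environment:
@pytest.mark.skipif(os.environ.get("VLLM_USE_V1") == "1",
                    reason="Test leverages V0 internals.")
def test_guarded_by_decorator():
    pass


# The fixture runs at setup time for every test in the module, so the pin
# always takes effect regardless of what the env looked like at collection:
@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        yield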
16 changes: 8 additions & 8 deletions tests/lora/test_phi.py
@@ -10,6 +10,14 @@
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(
@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1
16 changes: 8 additions & 8 deletions tests/lora/test_quant_model.py
@@ -37,6 +37,14 @@ class ModelWithQuantization:
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
@@ -69,14 +77,6 @@ def format_prompt_tuples(prompt):
return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,