[Misc] Enable V1 LoRA by default #15320
ChatGLM3 LoRA test file: the `v1` autouse fixture moves above the utility functions.

Reviewer note: cosmetic changes; moving the `run_with_both_engines_lora` wrapper above the utility functions.

```diff
@@ -18,6 +18,14 @@
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
```
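The fixture's comment notes that it "can be promoted up to conftest.py to run for every test in a package". As a minimal sketch of what that promotion could look like: the names `run_with_both_engines_lora` and the autouse `v1` wrapper come from the diff, while the parametrized body flipping `VLLM_USE_V1` is an assumed implementation of the engine toggle, not the PR's actual code.

```python
# Hypothetical conftest.py sketch. Only `run_with_both_engines_lora` and the
# autouse `v1` wrapper are names from the PR; the parametrized body below is
# an assumed implementation of the engine toggle.
import pytest


@pytest.fixture(params=["0", "1"], ids=["v0-engine", "v1-engine"])
def run_with_both_engines_lora(request, monkeypatch):
    # Run the test once per engine by flipping VLLM_USE_V1 (assumed toggle).
    monkeypatch.setenv("VLLM_USE_V1", request.param)
    yield


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Promoted autouse wrapper: every test in the package now runs under
    # both engines without each test module declaring the fixture itself.
    pass
```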
```diff
@@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(MODEL_PATH,
```
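For readers unfamiliar with the test utilities, here is a rough illustration of what a decorator like `create_new_process_for_each_test` might do. This is an assumption made for illustration only, not vLLM's actual implementation:

```python
# Illustrative-only sketch: run each decorated test in a fresh subprocess so
# GPU/engine state cannot leak between tests. Not vLLM's real implementation.
import functools
import multiprocessing


def create_new_process_for_each_test():

    def decorator(test_fn):

        @functools.wraps(test_fn)
        def wrapper(*args, **kwargs):
            # "spawn" gives a clean interpreter, which matters for CUDA.
            ctx = multiprocessing.get_context("spawn")
            proc = ctx.Process(target=test_fn, args=args, kwargs=kwargs)
            proc.start()
            proc.join()
            assert proc.exitcode == 0, f"{test_fn.__name__} failed in child"

        return wrapper

    return decorator
```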
Gemma LoRA test file (`MODEL_PATH = "google/gemma-7b"`): the same fixture move; the `skip_v1` marker on the memory-hungry test remains in place.

```diff
@@ -9,6 +9,14 @@
 MODEL_PATH = "google/gemma-7b"


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         "Quote: Imagination is",
```

```diff
@@ -31,14 +39,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 # The V1 lora test for this model requires more than 24GB.
 @pytest.mark.skip_v1
 @pytest.mark.xfail(current_platform.is_rocm(),
```

Reviewer notes on the `skip_v1` marker: "I am checking if the tests pass with latest main 🤞", followed by "the CI still OOMs, so the test stays turned off for now."
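`skip_v1` is a custom marker, and one plausible way such a marker could be honored is a collection hook in `conftest.py`. The hook below is an illustrative assumption (the `VLLM_USE_V1` check and the reason string are guesses), not the repository's actual wiring:

```python
# Assumed conftest.py hook showing one way a custom `skip_v1` marker could
# be enforced; the VLLM_USE_V1 check and reason text are illustrative guesses.
import os

import pytest


def pytest_collection_modifyitems(config, items):
    # Only intervene when the suite is running against the V1 engine.
    if os.environ.get("VLLM_USE_V1") != "1":
        return
    skip = pytest.mark.skip(reason="test disabled on the V1 engine")
    for item in items:
        if "skip_v1" in item.keywords:
            # e.g. the gemma test above, which needs more than 24GB on V1.
            item.add_marker(skip)
```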
LoRA punica test file: the `importlib` reload of `punica_gpu` is removed from the fixture, along with the now-unused import.

```diff
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0

-import importlib
 import random
 from copy import deepcopy
 from dataclasses import dataclass
```

```diff
@@ -82,10 +81,6 @@ def v1(run_with_both_engines_lora):
     # This can be promoted up to conftest.py to run for every
     # test in a package

-    # Reload punica_gpu as the kernels used are tied to engine type.
-    from vllm.lora.punica_wrapper import punica_gpu
-    importlib.reload(punica_gpu)
-
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
```

Reviewer note: the reload has been a no-op since #14685 landed, because V0 and V1 now use the same kernels.
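After this change the fixture's teardown keeps only the memory release. The diff shows just the imports of `_LORA_A_PTR_DICT` and `_LORA_B_PTR_DICT`; the sketch below assumes the cleanup amounts to clearing those caches and freeing CUDA memory, which is a guess at the elided lines:

```python
# Assumed shape of the remaining cleanup. The `.clear()` and empty_cache()
# calls are guesses; the diff only shows the imports of the two dicts.
import gc

import torch


def release_lora_kernel_caches():
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)
    # Drop cached pointer tables so repeated CI runs don't accumulate memory.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
```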
Llama SQL LoRA test file: the fixture moves up, and the `skip_v1` markers on the two numerics-sensitive tests are removed.

```diff
@@ -28,6 +28,14 @@
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
```

```diff
@@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
     print("removing lora")


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @create_new_process_for_each_test()
 def test_llama_lora(sql_lora_files):
```

```diff
@@ -126,8 +124,6 @@ def get_num_gpu_blocks_no_lora():
         "less when using lora than when not using lora")


-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
```

Reviewer note: these numeric test failures appear to be resolved. The markers were added in #13726, and the failure was actually due to a "can't find tokenizer file" error. Discussion on Slack: https://vllm-dev.slack.com/archives/C07R5PAL2L9/p1742010357759669