I want to use the vLLM engine in LitServe and enable batching. The code below works fine, but when I make parallel requests, the results of both requests are returned in the response to the first request. Do you know how to modify the code to avoid this?
Code sample
import logging
from typing import List

import litserve as ls
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams, TextPrompt
from vllm.entrypoints.chat_utils import apply_hf_chat_template, parse_chat_messages
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.utils import random_uuid

logger = logging.getLogger(__name__)


class SimpleLitAPI(ls.LitAPI):
    def setup(self, device):
        config = self._config
        engine_args = EngineArgs.from_cli_args(config)
        self.llm_engine = LLMEngine.from_engine_args(engine_args)
        self.tokenizer = self.llm_engine.get_tokenizer_group(TokenizerGroup).tokenizer
        self.model_config = self.llm_engine.get_model_config()

    def batch(self, inputs):
        return inputs

    def predict(self, prompt, context):
        # parse the batched chat messages
        conversations = []
        for item in prompt:
            conversation, mm_data = parse_chat_messages(
                item, self.model_config, self.tokenizer)
            conversations.append(conversation)

        # render each conversation with the HF chat template
        prompts = []
        logger.info(f'type_tokenizer: {type(self.tokenizer)}')
        for conversation in conversations:
            chat_prompt = apply_hf_chat_template(
                self.tokenizer,
                conversation,
                chat_template=None,
            )
            prompts.append(chat_prompt)

        inputs = []
        for p in prompts:
            inputs.append(TextPrompt(prompt=p))

        contexts = context
        context_list = []
        for context in contexts:
            # read per-request params
            temperature = float(context.get("temperature", 1.0))
            top_p = float(context.get("top_p", 1.0))
            max_new_tokens = context.get("max_new_tokens", 256)
            top_k = context.get("top_k", -1)
            presence_penalty = float(context.get("presence_penalty", 0.0))
            frequency_penalty = float(context.get("frequency_penalty", 0.0))
            best_of = context.get("best_of", None)

            # build the vLLM sampling params
            top_p = max(top_p, 1e-5)
            if temperature <= 1e-5:
                top_p = 1.0
            sampling_params = SamplingParams(
                n=1,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_new_tokens,
                top_k=top_k,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty,
                best_of=best_of,
            )
            context_list.append(sampling_params)

        result = list(zip(inputs, context_list))
        while result or self.llm_engine.has_unfinished_requests():
            if result:
                text_prompt, sampling_params = result.pop(0)
                request_id = random_uuid()
                self.llm_engine.add_request(str(request_id), text_prompt, sampling_params)
            previous_output = ""
            while self.llm_engine.has_unfinished_requests():
                request_outputs: List[RequestOutput] = self.llm_engine.step()
                for request_output in request_outputs:
                    current_output = request_output.outputs[0].text
                    new_output = current_output[len(previous_output):]  # take only the newly generated part
                    previous_output = current_output  # remember what has already been emitted
                    if new_output:  # yield only when there is new text
                        yield [{'role': 'assistant', 'content': new_output}]
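For context, I start the server roughly like the sketch below; the max_batch_size, batch_timeout, and stream values are illustrative placeholders rather than my exact settings.

import litserve as ls

if __name__ == "__main__":
    api = SimpleLitAPI()
    # assumption: api._config (the parsed vLLM engine args) is attached here,
    # since setup() reads it via EngineArgs.from_cli_args(self._config)
    server = ls.LitServer(
        api,
        accelerator="auto",
        max_batch_size=8,     # batch() groups up to 8 concurrent requests
        batch_timeout=0.05,   # wait up to 50 ms while filling a batch
        stream=True,          # predict() is a generator, so streaming is enabled
    )
    server.run(port=8000)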
I don't know if my idea is right. Maybe I need to collect all the outputs, combine them into a list, and put that list in the yield so the batch function works correctly?
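Something like the sketch below is what I have in mind for the tail of predict(): track each request separately by its request_id and yield one batch-ordered list per engine step, so that unbatch() can, I assume, split the list by position and route each entry back to its own request. I haven't verified that this is the shape LitServe expects for batched streaming.

        # sketch only: replaces the generation loop at the end of predict()
        request_ids = []
        for text_prompt, sampling_params in zip(inputs, context_list):
            request_id = str(random_uuid())
            request_ids.append(request_id)
            self.llm_engine.add_request(request_id, text_prompt, sampling_params)

        previous = {rid: "" for rid in request_ids}  # text already emitted per request

        while self.llm_engine.has_unfinished_requests():
            step_outputs: List[RequestOutput] = self.llm_engine.step()
            latest = dict(previous)  # requests with no output this step keep their old text
            for request_output in step_outputs:
                latest[request_output.request_id] = request_output.outputs[0].text

            # one entry per request, in the same order as the batched inputs
            deltas = []
            for rid in request_ids:
                new_text = latest[rid][len(previous[rid]):]
                previous[rid] = latest[rid]
                deltas.append({'role': 'assistant', 'content': new_text})
            yield deltas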