Skip to content

Commit ffe4ba9

Browse files
authored
Fix crash and remove sys_instruct from chat.py and client.py (#591)
* fix crash
* update profile_generation.py
* format
* use self.bos_id
* remove sys_instruct
1 parent af2f072 commit ffe4ba9

File tree

5 files changed

+11
-13
lines changed

5 files changed

+11
-13
lines changed

benchmark/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pip install nvidia-ml-py
3030
```bash
3131
python profile_generation.py \
3232
--model-path /path/to/your/model \
33-
--concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
33+
--concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
3434
```
3535

3636
## profile serving

benchmark/profile_generation.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def _infer(model, session_id):
9090

9191
def profile_throughput(model_path: str,
9292
concurrency: int = 1,
93-
input_seqlen: int = 0,
93+
input_seqlen: int = 1,
9494
output_seqlen: int = 512,
9595
test_round: int = 10,
9696
tp: int = 1):
@@ -99,8 +99,10 @@ def profile_throughput(model_path: str,
9999
tm_model = TurboMind(model_path=model_path, tp=tp)
100100

101101
# make up a prompt that can be tokenized into {input_seqlen} tokens
102-
prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
102+
assert input_seqlen > 0, 'input_seqlen should > 0'
103+
prompt = 'hi'
103104
input_ids = tokenizer.encode(prompt)
105+
input_ids = input_ids * input_seqlen
104106

105107
warmup(tm_model, concurrency, input_ids, output_seqlen)
106108

lmdeploy/serve/client.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ def input_prompt(model_name):
2020
def main(tritonserver_addr: str,
2121
session_id: int = 1,
2222
cap: str = 'chat',
23-
sys_instruct: str = None,
2423
stream_output: bool = True,
2524
**kwargs):
2625
"""An example to communicate with inference server through the command line
@@ -32,13 +31,11 @@ def main(tritonserver_addr: str,
3231
session_id (int): the identical id of a session
3332
cap (str): the capability of a model. For example, codellama has
3433
the ability among ['completion', 'infill', 'instruct', 'python']
35-
sys_instruct (str): the content of 'system' role, which is used by
36-
conversational model
3734
stream_output (bool): indicator for streaming output or not
3835
**kwargs (dict): other arguments for initializing model's chat template
3936
"""
4037
log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
41-
kwargs.update(capability=cap, system=sys_instruct)
38+
kwargs.update(capability=cap)
4239
chatbot = Chatbot(tritonserver_addr,
4340
log_level=log_level,
4441
display=stream_output,

lmdeploy/serve/turbomind/chatbot.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,10 @@ def _stream_infer(self,
459459
session.sequence_length = 0
460460

461461
input_ids, input_lengths = self.preprocess(prompt)
462+
# will crash if last_token_id == eos_id and send empty input_ids
463+
if sequence_end and request_output_len == 0:
464+
input_ids = np.array([[self.bos_id]], dtype=np.uint32)
465+
input_lengths = np.array([[1]], dtype=np.uint32)
462466
input_tokens = input_lengths.squeeze()
463467
if self.profile_generation:
464468
yield StatusCode.TRITON_STREAM_ING, \

lmdeploy/turbomind/chat.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ def get_gen_param(cap,
7373
def main(model_path,
7474
session_id: int = 1,
7575
cap: str = 'chat',
76-
sys_instruct: str = None,
7776
tp=1,
7877
stream_output=True,
7978
**kwargs):
@@ -85,8 +84,6 @@ def main(model_path,
8584
session_id (int): the identical id of a session
8685
cap (str): the capability of a model. For example, codellama has
8786
the ability among ['completion', 'infilling', 'chat', 'python']
88-
sys_instruct (str): the content of 'system' role, which is used by
89-
conversational model
9087
tp (int): GPU number used in tensor parallelism
9188
stream_output (bool): indicator for streaming output or not
9289
**kwarg (dict): other arguments for initializing model's chat template
@@ -100,9 +97,7 @@ def main(model_path,
10097
step = 0
10198
seed = random.getrandbits(64)
10299
model_name = tm_model.model_name
103-
model = MODELS.get(model_name)(capability=cap, **kwargs) \
104-
if sys_instruct is None else MODELS.get(model_name)(
105-
capability=cap, system=sys_instruct, **kwargs)
100+
model = MODELS.get(model_name)(capability=cap, **kwargs)
106101

107102
print(f'session {session_id}')
108103
while True:

0 commit comments

Comments (0)