Skip to content

Commit ebe90bc

Browse files
authored
Report the inference benchmark of models with different size (#794)
* update test scripts for models with different sizes * update * only test after tuning gemm * chmod +x * fix typo * benchmark on a100 * fix typo * fix typo * per-token latency percentile in profile_throughput * fix * fix * rename * make the script accept parameters * minor fix * indent * reformat table * change to 3000 * minor fix
1 parent 5b9e454 commit ebe90bc

File tree

9 files changed

+556
-49
lines changed

9 files changed

+556
-49
lines changed

benchmark/benchmark_13b.sh

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/bin/bash
2+
if [ -z "$1" ]
3+
then
4+
echo "Error. Please input the model path of llama2-13b model"
5+
exit 1
6+
fi
7+
8+
workspace_dir=$(dirname $(realpath "$0"))
9+
10+
tp=1
11+
model_path="$1"
12+
model_foldername=$(basename "$model_path")
13+
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
14+
15+
# convert
16+
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
17+
if [ $? != 0 ]
18+
then
19+
exit 1
20+
fi
21+
22+
# update recommended config to config.ini
23+
config_path=${turbomind_model_path}/triton_models/weights/config.ini
24+
25+
apt-get update
26+
apt-get install crudini -y
27+
28+
crudini --set ${config_path} llama max_context_token_num 4
29+
crudini --set ${config_path} llama cache_chunk_size -1
30+
crudini --set ${config_path} llama cache_max_entry_count 500
31+
crudini --set ${config_path} llama max_batch_size 128
32+
# end of update config
33+
34+
cd ${workspace_dir}
35+
36+
# download dataset
37+
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
38+
39+
benchmark_rpm () {
40+
output_path=$1
41+
mkdir -p "${output_path}"
42+
43+
batches=(64 128)
44+
for batch in "${batches[@]}"
45+
do
46+
for i in {1..3}
47+
do
48+
python3 profile_throughput.py \
49+
ShareGPT_V3_unfiltered_cleaned_split.json \
50+
${turbomind_model_path} \
51+
--concurrency "$batch" \
52+
--num_prompts 3000 \
53+
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
54+
done
55+
done
56+
}
57+
58+
benchmark_generation () {
59+
output_path=$1
60+
mkdir -p "${output_path}"
61+
62+
python3 profile_generation.py \
63+
${turbomind_model_path} \
64+
--concurrency 1 16 32 64 \
65+
--csv ${output_path}/generation.csv
66+
}
67+
68+
################################# BENCHMARK AFTER TUNING GEMM #################################
69+
# tune gemm
70+
head_num=$(crudini --get "${config_path}" llama head_num)
71+
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
72+
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
73+
inter_size=$(crudini --get "${config_path}" llama inter_size)
74+
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
75+
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
76+
77+
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
78+
79+
python3 -m lmdeploy.turbomind.generate_gemm_config \
80+
--head_num ${head_num} \
81+
--size_per_head ${size_per_head} \
82+
--vocab_size ${vocab_size} \
83+
--inter_size ${inter_size} \
84+
--tensor_para_size ${tensor_para_size} \
85+
--max_batch_size ${max_batch_size}
86+
87+
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
88+
# benchmark request throughput and static inference
89+
benchmark_rpm ${output_path}
90+
benchmark_generation ${output_path}
91+
92+
mv gemm_config.in ${output_path}

benchmark/benchmark_20b.sh

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
#!/bin/bash
# Benchmark the internlm-20b model with lmdeploy/turbomind:
#   1. convert the HF model into a turbomind workspace (tp=2)
#   2. apply the recommended config.ini values via crudini
#   3. tune GEMM for this model shape
#   4. profile request throughput (benchmark_rpm) and static generation
#
# Usage: benchmark_20b.sh <path-to-internlm-20b-model>

if [ -z "$1" ]
then
    echo "Error. Please input the model path of internlm-20b model"
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert; abort immediately so we never benchmark a half-built workspace
if ! lmdeploy convert internlm-20b "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 700
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Run profile_throughput.py three times per batch size; one csv per run.
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

# Profile static generation at several concurrency levels.
benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}"/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

output_path="${workspace_dir}/output/${model_foldername}-tunned-gemm-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

cp gemm_config.in "${output_path}"

benchmark/benchmark_70b.sh

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# Benchmark the llama2-70b model with lmdeploy/turbomind:
#   1. convert the HF model into a turbomind workspace (tp=4)
#   2. apply the recommended config.ini values via crudini
#   3. profile request throughput (benchmark_rpm) and static generation
# NOTE(review): unlike the 7b/13b/20b scripts this one runs no GEMM tuning
# step — presumably intentional; confirm before adding one.
#
# Usage: benchmark_70b.sh <path-to-llama2-70b-model>

if [ -z "$1" ]
then
    echo "Error. Please input the model path of llama2-70b model"
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert; abort immediately so we never benchmark a half-built workspace
if ! lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 4000
crudini --set "${config_path}" llama max_batch_size 256
# end of update config

cd "${workspace_dir}" || exit 1

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Run profile_throughput.py three times per batch size; one csv per run.
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

# Profile static generation at several concurrency levels.
benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 64 128 256 \
        --csv "${output_path}"/generation.csv
}

output_path="${workspace_dir}/output/${model_foldername}-tp${tp}"
# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

benchmark/benchmark_7b.sh

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
#!/bin/bash
# Benchmark the llama2-7b model with lmdeploy/turbomind:
#   1. convert the HF model into a turbomind workspace (tp=1)
#   2. apply the recommended config.ini values via crudini
#   3. tune GEMM for this model shape
#   4. profile request throughput (benchmark_rpm) and static generation
#
# Usage: benchmark_7b.sh <path-to-llama2-7b-model>

if [ -z "$1" ]
then
    echo "Error. Please input the model path of llama2-7b model"
    exit 1
fi

workspace_dir=$(dirname "$(realpath "$0")")

tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}/workspace/${model_foldername}"

# convert; abort immediately so we never benchmark a half-built workspace
if ! lmdeploy convert llama2 "${model_path}" --dst-path "${turbomind_model_path}" --tp ${tp}
then
    exit 1
fi

# update recommended config to config.ini
config_path="${turbomind_model_path}/triton_models/weights/config.ini"

apt-get update
apt-get install crudini -y

crudini --set "${config_path}" llama max_context_token_num 4
crudini --set "${config_path}" llama cache_chunk_size -1
crudini --set "${config_path}" llama cache_max_entry_count 1000
crudini --set "${config_path}" llama max_batch_size 128
# end of update config

cd "${workspace_dir}" || exit 1

# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# Run profile_throughput.py three times per batch size; one csv per run.
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"

    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                "${turbomind_model_path}" \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv "${output_path}"/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

# Profile static generation at several concurrency levels.
benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"

    python3 profile_generation.py \
        "${turbomind_model_path}" \
        --concurrency 1 16 32 64 \
        --csv "${output_path}"/generation.csv
}

################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}/output/${model_foldername}-tunned-gemm-tp${tp}"

# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)

echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size

python3 -m lmdeploy.turbomind.generate_gemm_config \
    --head_num ${head_num} \
    --size_per_head ${size_per_head} \
    --vocab_size ${vocab_size} \
    --inter_size ${inter_size} \
    --tensor_para_size ${tensor_para_size} \
    --max_batch_size ${max_batch_size}

# benchmark request throughput and static inference
benchmark_rpm "${output_path}"
benchmark_generation "${output_path}"

mv gemm_config.in "${output_path}"

benchmark/profile_generation.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,14 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
2929
for _ in range(test_round):
3030
token_latency_stats = [0] * (output_seqlen + 1)
3131
prev = time.perf_counter()
32-
n_pre_token = 0
32+
n_prev_token = 0
3333
"""
3434
The iterator provided by `stream_infer` denotes the number of generated tokens so far,
3535
which is represented by the variable `n_token`.
3636
Please note that `n_token` is not a continuous value. In other words, during the iteration,
3737
its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
3838
So, it is quite difficult to get the latency of each generated token.
39-
As a work-around, we set the latency `new-prev` of each iteration to the first token of
39+
As a work-around, we set the latency `now-prev` of each iteration to the first token of
4040
the new generated tokens, and leave the latency of the rest tokens being 0.
4141
For example, in the first iteration, 5 tokens are generated.
4242
The time elapsing in this iteration `now-prev` is set to the latency of first token of
@@ -54,9 +54,9 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
5454
temperature=temperature):
5555
_, n_token = outputs[0]
5656
now = time.perf_counter()
57-
if n_pre_token != n_token:
58-
token_latency_stats[n_pre_token] = np.round(now - prev, 3)
59-
n_pre_token = n_token
57+
if n_prev_token != n_token:
58+
token_latency_stats[n_prev_token] = np.round(now - prev, 3)
59+
n_prev_token = n_token
6060
prev = now
6161
if session_id == 1:
6262
pbar.update(1)

0 commit comments

Comments
 (0)