
Commit 3dcf128

Fixed the benchmark typo
1 parent b06e632 commit 3dcf128

3 files changed: +41 -24 lines

tools/perf/Flux/benchmark.sh (+15 -1)

@@ -1,6 +1,20 @@
 #TODO: Enter the HF Token
 huggingface-cli login --token HF_TOKEN
 
+nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,temperature.gpu,temperature.memory,power.draw,clocks.sm,clocks.mem,memory.total,memory.used --format=csv,nounits -lms 5000 >> fp8_gpu_utilization.txt &
+NVIDIA_SMI_PID=$!
 python flux_quantization.py --dtype fp8 > fp8_benchmark.txt
+kill $NVIDIA_SMI_PID
+sleep 10
+
+nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,temperature.gpu,temperature.memory,power.draw,clocks.sm,clocks.mem,memory.total,memory.used --format=csv,nounits -lms 5000 >> int8_gpu_utilization.txt &
+NVIDIA_SMI_PID=$!
 python flux_quantization.py --dtype int8 > int8_benchmark.txt
-python flux_perf.py > fp16_benchmark.txt
+kill $NVIDIA_SMI_PID
+sleep 10
+
+nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,temperature.gpu,temperature.memory,power.draw,clocks.sm,clocks.mem,memory.total,memory.used --format=csv,nounits -lms 5000 >> fp16_gpu_utilization.txt &
+NVIDIA_SMI_PID=$!
+python flux_perf.py > fp16_benchmark.txt
+kill $NVIDIA_SMI_PID
+sleep 10
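
For reference, each nvidia-smi poller above appends one CSV sample every 5 seconds, in the field order given by --query-gpu. A minimal sketch of how those logs could be summarized after a run; this helper is an assumption for illustration, not part of the commit, and relies only on the column order and file names used above:

import csv
import sys

# Column order follows the --query-gpu list in benchmark.sh:
# index, utilization.gpu, utilization.memory, temperature.gpu,
# temperature.memory, power.draw, clocks.sm, clocks.mem, memory.total, memory.used

def summarize(path):
    util, power = [], []
    with open(path, newline="") as f:
        for row in csv.reader(f):
            try:
                u = float(row[1])  # utilization.gpu [%]
                w = float(row[5])  # power.draw [W]
            except (ValueError, IndexError):
                continue  # skip the header row and any N/A samples
            util.append(u)
            power.append(w)
    if util:
        print(f"{path}: mean util {sum(util) / len(util):.1f}%, "
              f"max util {max(util):.0f}%, mean power {sum(power) / len(power):.1f} W")

if __name__ == "__main__":
    files = sys.argv[1:] or ["fp8_gpu_utilization.txt", "int8_gpu_utilization.txt", "fp16_gpu_utilization.txt"]
    for p in files:
        summarize(p)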

tools/perf/Flux/flux_perf.py (+21 -20)

@@ -8,11 +8,11 @@
     print(torch.cuda.get_device_properties(i).name)
 
 DEVICE = "cuda:0"
-# pipe = FluxPipeline.from_pretrained(
-#     "black-forest-labs/FLUX.1-dev",
-#     torch_dtype=torch.float32,
-# )
-pipe.to(DEVICE).to(torch.float32)
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.bfloat16,
+)
+pipe.to(DEVICE).to(torch.bfloat16)
 backbone = pipe.transformer
 
 
@@ -44,10 +44,11 @@
     "debug": False,
     "use_python_runtime": True,
     "immutable_weights": False,
+    "offload_module_to_cpu": True,
 }
 
 
-def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
+def generate_image(prompt, inference_step, batch_size=1, benchmark=False, iterations=1):
 
     start = time()
     for i in range(iterations):
@@ -59,38 +60,38 @@ def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
         ).images
     end = time()
     if benchmark:
+        print(f"Batch Size: {batch_size}")
         print("Time Elapse for", iterations, "iterations:", end - start)
         print(
             "Average Latency Per Step:",
-            (end - start) / inference_step / iterations / batchsize,
+            (end - start) / inference_step / iterations / batch_size,
         )
     return image
 
 
-generate_image(["Test"], 2)
-print("Benchmark Original PyTorch Module Latency (float32)")
-generate_image(["Test"], 50, benchmark=True, iterations=3)
+pipe.to(torch.bfloat16)
+torch.cuda.empty_cache()
+# Warmup
+generate_image(["Test"], 20)
+print("Benchmark Original PyTorch Module Latency (bfloat16)")
+for batch_size in range(1, 9):
+    generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)
 
 pipe.to(torch.float16)
 print("Benchmark Original PyTorch Module Latency (float16)")
-generate_image(["Test"], 50, benchmark=True, iterations=3)
-
+for batch_size in range(1, 9):
+    generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)
 
 trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
 trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
 pipe.transformer = trt_gm
 
 start = time()
-generate_image(["Test"], 2)
+generate_image(["Test"], 2, batch_size=2)
 end = time()
 print("Time Elapse compilation:", end - start)
 print()
 print("Benchmark TRT Accelerated Latency")
-generate_image(["Test"], 50, benchmark=True, iterations=3)
+for batch_size in range(1, 9):
+    generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)
 torch.cuda.empty_cache()
-
-
-with torch_tensorrt.runtime.enable_cudagraphs(trt_gm):
-    generate_image(["Test"], 2)
-    print("Benchmark TRT Accelerated Latency with Cuda Graph")
-    generate_image(["Test"], 50, benchmark=True, iterations=3)

tools/perf/Flux/flux_quantization.py (+5 -3)

@@ -75,7 +75,7 @@ def generate_image(pipe, prompt, image_name):
     print(f"Image generated using {image_name} model saved as {image_name}.png")
 
 
-def benchmark(prompt, inference_step, batch_size=2, iterations=1):
+def benchmark(prompt, inference_step, batch_size=1, iterations=1):
     from time import time
 
     start = time()
@@ -87,6 +87,7 @@ def benchmark(prompt, inference_step, batch_size=2, iterations=1):
             num_images_per_prompt=batch_size,
         ).images
     end = time()
+    print(f"Batch Size: {batch_size}")
    print("Time Elapse for", iterations, "iterations:", end - start)
    print(
        "Average Latency Per Step:",
@@ -127,7 +128,7 @@ def forward_loop(mod):
 mtq.disable_quantizer(backbone, filter_func)
 
 batch_size = 2
-BATCH = torch.export.Dim("batch", min=1, max=2)
+BATCH = torch.export.Dim("batch", min=1, max=8)
 SEQ_LEN = torch.export.Dim("seq_len", min=1, max=512)
 # This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.
 # To see this recommendation, you can try exporting using min=1, max=4096
@@ -199,7 +200,8 @@ def forward_loop(mod):
 
 
 print(f"Benchmark TRT Module Latency at ({args.dtype})")
-benchmark(["Test"], 50, batch_size=2, iterations=3)
+for batch_size in range(1, 9):
+    benchmark(["Test"], 20, batch_size=batch_size, iterations=3)
 print()
 
 # For this dummy model, the fp16 engine size is around 1GB, fp32 engine size is around 2GB
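
Widening BATCH from max=2 to max=8 is what allows the exported, compiled transformer to accept the batch sizes 1-8 swept by the benchmark loop. A minimal sketch of declaring such a dynamic batch dimension with torch.export; the toy module is an assumption for illustration, while flux_quantization.py wires BATCH and SEQ_LEN into its own dynamic_shapes spec for the FLUX backbone:

import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return x * 2

# Mark dim 0 of x as dynamic in the range [1, 8], mirroring the BATCH dim above.
BATCH = torch.export.Dim("batch", min=1, max=8)
example = torch.randn(2, 16)  # example input; its batch dimension becomes symbolic
ep = torch.export.export(Toy(), (example,), dynamic_shapes={"x": {0: BATCH}})
print(ep)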
