    print(torch.cuda.get_device_properties(i).name)

DEVICE = "cuda:0"
- # pipe = FluxPipeline.from_pretrained(
- #     "black-forest-labs/FLUX.1-dev",
- #     torch_dtype=torch.float32,
- # )
- pipe.to(DEVICE).to(torch.float32)
+ pipe = FluxPipeline.from_pretrained(
+     "black-forest-labs/FLUX.1-dev",
+     torch_dtype=torch.bfloat16,
+ )
+ pipe.to(DEVICE).to(torch.bfloat16)

backbone = pipe.transformer
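Side note, not part of the diff: passing `torch_dtype=torch.bfloat16` to `from_pretrained` loads the checkpoint directly in bfloat16 instead of materializing float32 weights and downcasting afterwards, which roughly halves peak host memory during loading. A quick sketch to confirm the dtype and placement actually took effect (assumes the pipeline from this hunk):

```python
# Sketch: verify the denoising backbone landed in bfloat16 on the target device.
p = next(pipe.transformer.parameters())
print(p.dtype, p.device)  # expected: torch.bfloat16 cuda:0
```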
    "debug": False,
    "use_python_runtime": True,
    "immutable_weights": False,
+   "offload_module_to_cpu": True,
}
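The newly added `offload_module_to_cpu` flag stages the original PyTorch module's weights in host memory while the TensorRT engine is built, trading extra host-device transfers for a lower peak GPU footprint during compilation. A rough, hedged way to observe the effect using standard PyTorch memory counters; the placement around the engine build is illustrative only:

```python
import torch

torch.cuda.reset_peak_memory_stats()
# ... engine build happens here: the first call through the
#     MutableTorchTensorRTModule triggers compilation (see timing below) ...
peak_gib = torch.cuda.max_memory_allocated() / 2**30
print(f"Peak GPU memory during compile: {peak_gib:.2f} GiB")
```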
- def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
+ def generate_image(prompt, inference_step, batch_size=1, benchmark=False, iterations=1):

    start = time()
    for i in range(iterations):
@@ -59,38 +60,38 @@ def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
        ).images
    end = time()
    if benchmark:
+       print(f"Batch Size: {batch_size}")
        print("Time Elapse for", iterations, "iterations:", end - start)
        print(
            "Average Latency Per Step:",
-           (end - start) / inference_step / iterations / batchsize,
+           (end - start) / inference_step / iterations / batch_size,
        )
    return image
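A timing caveat worth flagging (editor's note, not part of the diff): CUDA kernels launch asynchronously, so bracketing GPU work with `time()` can misattribute time unless the device is synchronized. Here `pipe(...).images` copies results to host, which synchronizes implicitly, so the reported numbers remain meaningful; an explicit sync just removes the ambiguity. A minimal sketch of the tighter pattern:

```python
import torch
from time import time

def timed(fn, iterations=3):
    torch.cuda.synchronize()   # drain pending GPU work before starting the clock
    start = time()
    for _ in range(iterations):
        fn()
    torch.cuda.synchronize()   # wait for the last kernels before stopping it
    return (time() - start) / iterations
```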
- generate_image(["Test"], 2)
- print("Benchmark Original PyTorch Module Latency (float32)")
- generate_image(["Test"], 50, benchmark=True, iterations=3)
+ pipe.to(torch.bfloat16)
+ torch.cuda.empty_cache()
+ # Warmup
+ generate_image(["Test"], 20)
+ print("Benchmark Original PyTorch Module Latency (bfloat16)")
+ for batch_size in range(1, 9):
+     generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)

pipe.to(torch.float16)
print("Benchmark Original PyTorch Module Latency (float16)")
- generate_image(["Test"], 50, benchmark=True, iterations=3)
-
+ for batch_size in range(1, 9):
+     generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)
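Why benchmark both half-precision formats? Beyond speed, they trade off differently: bfloat16 keeps float32's exponent range, so large diffusion transformers are less prone to overflow, while float16 offers finer mantissa precision over a much narrower range. A quick illustration with standard `torch.finfo`:

```python
import torch

print(torch.finfo(torch.bfloat16).max)  # ~3.39e38: same exponent range as float32
print(torch.finfo(torch.float16).max)   # 65504.0: narrow range, overflow-prone
```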
trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
pipe.transformer = trt_gm

start = time()
- generate_image(["Test"], 2)
+ generate_image(["Test"], 2, batch_size=2)
end = time()
print("Time Elapse compilation:", end - start)
print()
print("Benchmark TRT Accelerated Latency")
- generate_image(["Test"], 50, benchmark=True, iterations=3)
+ for batch_size in range(1, 9):
+     generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)

torch.cuda.empty_cache()
-
-
- with torch_tensorrt.runtime.enable_cudagraphs(trt_gm):
-     generate_image(["Test"], 2)
-     print("Benchmark TRT Accelerated Latency with Cuda Graph")
-     generate_image(["Test"], 50, benchmark=True, iterations=3)
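For context on the batch sweep above: the `range(1, 9)` benchmark only avoids engine recompilation if the `dynamic_shapes` spec passed to `set_expected_dynamic_shape_range` (defined earlier in the file, outside this diff) marks the batch dimension as dynamic up to 8. A hedged sketch of what such a spec typically looks like; the kwarg names and the max of 8 are assumptions chosen to match the sweep, not taken from this diff:

```python
import torch

# Hypothetical spec; keys must match the transformer forward()'s kwargs.
BATCH = torch.export.Dim("batch", min=1, max=8)
dynamic_shapes = {
    "hidden_states": {0: BATCH},
    "encoder_hidden_states": {0: BATCH},
    "pooled_projections": {0: BATCH},
    "timestep": {0: BATCH},
}

# Empty tuple: no positional args are dynamic; shapes come in via kwargs.
trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
```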