    print(torch.cuda.get_device_properties(i).name)

DEVICE = "cuda:0"
- # pipe = FluxPipeline.from_pretrained(
- #     "black-forest-labs/FLUX.1-dev",
- #     torch_dtype=torch.float32,
- # )
- pipe.to(DEVICE).to(torch.float32)
+ pipe = FluxPipeline.from_pretrained(
+     "black-forest-labs/FLUX.1-dev",
+     torch_dtype=torch.bfloat16,
+ )
+ pipe.to(DEVICE).to(torch.bfloat16)

backbone = pipe.transformer
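Side note, not part of the diff: passing `torch_dtype=torch.bfloat16` to `from_pretrained` loads the checkpoint directly in bfloat16 instead of materializing float32 weights and downcasting afterwards, which roughly halves peak host memory during loading. A quick sketch to confirm the dtype and placement actually took effect (assumes the pipeline from this hunk):

```python
# Sketch: verify the denoising backbone landed in bfloat16 on the target device.
p = next(pipe.transformer.parameters())
print(p.dtype, p.device)  # expected: torch.bfloat16 cuda:0
```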
    "debug": False,
    "use_python_runtime": True,
    "immutable_weights": False,
+   "offload_module_to_cpu": True,
}
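The newly added `offload_module_to_cpu` flag stages the original PyTorch module's weights in host memory while the TensorRT engine is built, trading extra host-device transfers for a lower peak GPU footprint during compilation. A rough, hedged way to observe the effect using standard PyTorch memory counters; the placement around the engine build is illustrative only:

```python
import torch

torch.cuda.reset_peak_memory_stats()
# ... engine build happens here: the first call through the
#     MutableTorchTensorRTModule triggers compilation (see timing below) ...
peak_gib = torch.cuda.max_memory_allocated() / 2**30
print(f"Peak GPU memory during compile: {peak_gib:.2f} GiB")
```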
- def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
+ def generate_image(prompt, inference_step, batch_size=1, benchmark=False, iterations=1):

    start = time()
    for i in range(iterations):
@@ -59,38 +60,38 @@ def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
        ).images
    end = time()
    if benchmark:
+       print(f"Batch Size: {batch_size}")
        print("Time Elapse for", iterations, "iterations:", end - start)
        print(
            "Average Latency Per Step:",
-           (end - start) / inference_step / iterations / batchsize,
+           (end - start) / inference_step / iterations / batch_size,
        )
    return image
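A timing caveat worth flagging (editor's note, not part of the diff): CUDA kernels launch asynchronously, so bracketing GPU work with `time()` can misattribute time unless the device is synchronized. Here `pipe(...).images` copies results to host, which synchronizes implicitly, so the reported numbers remain meaningful; an explicit sync just removes the ambiguity. A minimal sketch of the tighter pattern:

```python
import torch
from time import time

def timed(fn, iterations=3):
    torch.cuda.synchronize()   # drain pending GPU work before starting the clock
    start = time()
    for _ in range(iterations):
        fn()
    torch.cuda.synchronize()   # wait for the last kernels before stopping it
    return (time() - start) / iterations
```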
- generate_image(["Test"], 2)
- print("Benchmark Original PyTorch Module Latency (float32)")
- generate_image(["Test"], 50, benchmark=True, iterations=3)
+ pipe.to(torch.bfloat16)
+ torch.cuda.empty_cache()
+ # Warmup
+ generate_image(["Test"], 20)
+ print("Benchmark Original PyTorch Module Latency (bfloat16)")
+ for batch_size in range(1, 9):
+     generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)

pipe.to(torch.float16)
print("Benchmark Original PyTorch Module Latency (float16)")
- generate_image(["Test"], 50, benchmark=True, iterations=3)
-
+ for batch_size in range(1, 9):
+     generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)
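Why benchmark both half-precision formats? Beyond speed, they trade off differently: bfloat16 keeps float32's exponent range, so large diffusion transformers are less prone to overflow, while float16 offers finer mantissa precision over a much narrower range. A quick illustration with standard `torch.finfo`:

```python
import torch

print(torch.finfo(torch.bfloat16).max)  # ~3.39e38: same exponent range as float32
print(torch.finfo(torch.float16).max)   # 65504.0: narrow range, overflow-prone
```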
trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
pipe.transformer = trt_gm

start = time()
- generate_image(["Test"], 2)
+ generate_image(["Test"], 2, batch_size=2)
end = time()
print("Time Elapse compilation:", end - start)
print()
print("Benchmark TRT Accelerated Latency")
- generate_image(["Test"], 50, benchmark=True, iterations=3)
+ for batch_size in range(1, 9):
+     generate_image(["Test"], 20, batch_size=batch_size, benchmark=True, iterations=3)

torch.cuda.empty_cache()
-
-
- with torch_tensorrt.runtime.enable_cudagraphs(trt_gm):
-     generate_image(["Test"], 2)
-     print("Benchmark TRT Accelerated Latency with Cuda Graph")
-     generate_image(["Test"], 50, benchmark=True, iterations=3)
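For context on the batch sweep above: the `range(1, 9)` benchmark only avoids engine recompilation if the `dynamic_shapes` spec passed to `set_expected_dynamic_shape_range` (defined earlier in the file, outside this diff) marks the batch dimension as dynamic up to 8. A hedged sketch of what such a spec typically looks like; the kwarg names and the max of 8 are assumptions chosen to match the sweep, not taken from this diff:

```python
import torch

# Hypothetical spec; keys must match the transformer forward()'s kwargs.
BATCH = torch.export.Dim("batch", min=1, max=8)
dynamic_shapes = {
    "hidden_states": {0: BATCH},
    "encoder_hidden_states": {0: BATCH},
    "pooled_projections": {0: BATCH},
    "timestep": {0: BATCH},
}

# Empty tuple: no positional args are dynamic; shapes come in via kwargs.
trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
```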