8
8
print (torch .cuda .get_device_properties (i ).name )
9
9
10
10
DEVICE = "cuda:0"
11
- # pipe = FluxPipeline.from_pretrained(
12
- # "black-forest-labs/FLUX.1-dev",
13
- # torch_dtype=torch.float32 ,
14
- # )
15
- pipe .to (DEVICE ).to (torch .float32 )
11
+ pipe = FluxPipeline .from_pretrained (
12
+ "black-forest-labs/FLUX.1-dev" ,
13
+ torch_dtype = torch .bfloat16 ,
14
+ )
15
+ pipe .to (DEVICE ).to (torch .bfloat16 )
16
16
backbone = pipe .transformer
17
17
18
18
44
44
"debug" : False ,
45
45
"use_python_runtime" : True ,
46
46
"immutable_weights" : False ,
47
+ "offload_module_to_cpu" : True ,
47
48
}
48
49
49
50
50
- def generate_image (prompt , inference_step , batch_size = 2 , benchmark = False , iterations = 1 ):
51
+ def generate_image (prompt , inference_step , batch_size = 1 , benchmark = False , iterations = 1 ):
51
52
52
53
start = time ()
53
54
for i in range (iterations ):
@@ -62,35 +63,37 @@ def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterat
62
63
print ("Time Elapse for" , iterations , "iterations:" , end - start )
63
64
print (
64
65
"Average Latency Per Step:" ,
65
- (end - start ) / inference_step / iterations / batchsize ,
66
+ (end - start ) / inference_step / iterations / batch_size ,
66
67
)
67
68
return image
68
69
69
70
70
- generate_image (["Test" ], 2 )
71
- print ("Benchmark Original PyTorch Module Latency (float32)" )
72
- generate_image (["Test" ], 50 , benchmark = True , iterations = 3 )
71
+ pipe .to (torch .bfloat16 )
72
+ torch .cuda .empty_cache ()
73
+ # Warmup
74
+ generate_image (["Test" ], 20 )
75
+ print ("Benchmark Original PyTorch Module Latency (bfloat16)" )
76
+ generate_image (["Test" ], 20 , benchmark = True , iterations = 3 )
73
77
74
78
pipe .to (torch .float16 )
75
79
print ("Benchmark Original PyTorch Module Latency (float16)" )
76
- generate_image (["Test" ], 50 , benchmark = True , iterations = 3 )
77
-
80
+ generate_image (["Test" ], 20 , benchmark = True , iterations = 3 )
78
81
79
82
trt_gm = torch_tensorrt .MutableTorchTensorRTModule (backbone , ** settings )
80
83
trt_gm .set_expected_dynamic_shape_range ((), dynamic_shapes )
81
84
pipe .transformer = trt_gm
82
85
83
86
start = time ()
84
- generate_image (["Test" ], 2 )
87
+ generate_image (["Test" ], 2 , batch_size = 2 )
85
88
end = time ()
86
89
print ("Time Elapse compilation:" , end - start )
87
90
print ()
88
91
print ("Benchmark TRT Accelerated Latency" )
89
- generate_image (["Test" ], 50 , benchmark = True , iterations = 3 )
92
+ generate_image (["Test" ], 20 , benchmark = True , iterations = 3 )
90
93
torch .cuda .empty_cache ()
91
94
92
95
93
96
with torch_tensorrt .runtime .enable_cudagraphs (trt_gm ):
94
97
generate_image (["Test" ], 2 )
95
98
print ("Benchmark TRT Accelerated Latency with Cuda Graph" )
96
- generate_image (["Test" ], 50 , benchmark = True , iterations = 3 )
99
+ generate_image (["Test" ], 20 , benchmark = True , iterations = 3 )
0 commit comments