Skip to content

Commit b572d6f

Browse files
authored
[ukernels] Add missing specializations on gfx942/gfx950 and associated e2e tests (#22446)
The primary purpose of this PR is to add missing e2e tests to cover all combinations of ukernels. We have different ukernels for static and dynamic shapes, so this PR introduces an option to generate_e2e_matmul_tests.py that allows specifying the dynamicity of m, n, and k.

Since ukernels require specialization to be enabled ([see PR #22425](#22425)), this PR also adds the following missing specializations:

- bf16 on gfx942
- f16, bf16, f8 on gfx950

This will positively affect performance on dynamic-shape matmuls without compile-time bounds information. For example:

```
!A_size = tensor<?x4096xbf16>
!B_size = tensor<4096x4096xbf16>
!C_size = tensor<?x4096xf32>

func.func @Matmul(
    %A : !A_size, %B : !B_size) -> !C_size {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %m = tensor.dim %A, %c0 : tensor<?x4096xbf16>
  %empty = tensor.empty(%m) : !C_size
  %C = linalg.fill ins(%cst : f32) outs(%empty : !C_size) -> !C_size
  %0 = linalg.matmul
    indexing_maps = [affine_map<(m, n, k) -> (m, k)>,
                     affine_map<(m, n, k) -> (n, k)>, // transpose
                     affine_map<(m, n, k) -> (m, n)>]
    ins(%A, %B : !A_size, !B_size)
    outs(%C : !C_size) -> !C_size
  return %0 : !C_size
}
```

Before PR: Time (ms): 30.831274
After PR: Time (ms): 0.284123
1 parent 10a48ac commit b572d6f

File tree

8 files changed

+400
-17
lines changed

8 files changed

+400
-17
lines changed

compiler/plugins/target/ROCM/builtins/specialization/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ endif()
2626
# Target archs for specialization patternsets. https://llvm.org/docs/AMDGPUUsage.html#processors
2727
gpu_archs = [
2828
"gfx942",
29+
"gfx950",
2930
]
3031

3132
specialization_patterns_mlir_files = [

compiler/plugins/target/ROCM/builtins/specialization/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ iree_c_embed_data(
1919
iree_specialization_patterns_amdgpu
2020
SRCS
2121
"specialization_patterns_gfx942.mlir"
22+
"specialization_patterns_gfx950.mlir"
2223
C_FILE_OUTPUT
2324
"iree_specialization_patterns_amdgpu.c"
2425
H_FILE_OUTPUT
@@ -32,6 +33,7 @@ iree_lit_test_suite(
3233
verify_specialization_patterns_amdgpu
3334
SRCS
3435
"specialization_patterns_gfx942.mlir"
36+
"specialization_patterns_gfx950.mlir"
3537
TOOLS
3638
iree-opt
3739
)

compiler/plugins/target/ROCM/builtins/specialization/specialization_patterns_gfx942.mlir

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,37 @@ pdl.pattern @f16_pingpong : benefit(1) {
3333
}
3434
}
3535

36+
pdl.pattern @bf16_pingpong : benefit(1) {
37+
%imaps = pdl.attribute = [
38+
affine_map<(d0, d1, d2) -> (d0, d2)>,
39+
affine_map<(d0, d1, d2) -> (d1, d2)>,
40+
affine_map<(d0, d1, d2) -> (d0, d1)>
41+
]
42+
%elemtypes = pdl.attribute = [bf16, bf16, f32]
43+
%operands = pdl.operands
44+
%types = pdl.types
45+
%matmul = pdl.operation (%operands : !pdl.range<value>) -> (%types : !pdl.range<type>)
46+
pdl.apply_native_constraint "matchContraction"(
47+
%matmul, %elemtypes, %imaps
48+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
49+
50+
// Skip if the operation already has ranges.
51+
%attr_name = pdl.attribute = "iree_codegen.specialization_ranges"
52+
pdl.apply_native_constraint "hasAttr"(
53+
%matmul, %attr_name
54+
: !pdl.operation, !pdl.attribute) {isNegated = true}
55+
56+
pdl.rewrite %matmul {
57+
%ranges = pdl.attribute = #util<int.assumption.multi_array[
58+
[<umin = 2048, udiv = 256>, <umin = 2048, udiv = 256>, <udiv = 64>], // Large pingpong
59+
[<umin = 1024, udiv = 128>, <umin = 1024, udiv = 128>, <udiv = 64>] // Medium pingpong
60+
]>
61+
pdl.apply_native_rewrite "annotateOperation"(
62+
%matmul, %attr_name, %ranges
63+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
64+
}
65+
}
66+
3667
pdl.pattern @f8E4M3_pingpong : benefit(1) {
3768
%imaps = pdl.attribute = [
3869
affine_map<(d0, d1, d2) -> (d0, d2)>,
compiler/plugins/target/ROCM/builtins/specialization/specialization_patterns_gfx950.mlir

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// RUN: iree-opt %s
2+
3+
// PDL pattern spec to annotate operations with specialization ranges.
4+
5+
pdl.pattern @f16_pingpong : benefit(1) {
6+
%imaps = pdl.attribute = [
7+
affine_map<(d0, d1, d2) -> (d0, d2)>,
8+
affine_map<(d0, d1, d2) -> (d1, d2)>,
9+
affine_map<(d0, d1, d2) -> (d0, d1)>
10+
]
11+
%elemtypes = pdl.attribute = [f16, f16, f32]
12+
%operands = pdl.operands
13+
%types = pdl.types
14+
%matmul = pdl.operation (%operands : !pdl.range<value>) -> (%types : !pdl.range<type>)
15+
pdl.apply_native_constraint "matchContraction"(
16+
%matmul, %elemtypes, %imaps
17+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
18+
19+
// Skip if the operation already has ranges.
20+
%attr_name = pdl.attribute = "iree_codegen.specialization_ranges"
21+
pdl.apply_native_constraint "hasAttr"(
22+
%matmul, %attr_name
23+
: !pdl.operation, !pdl.attribute) {isNegated = true}
24+
25+
pdl.rewrite %matmul {
26+
%ranges = pdl.attribute = #util<int.assumption.multi_array[
27+
[<umin = 2048, udiv = 256>, <umin = 2048, udiv = 256>, <udiv = 64>], // Large pingpong
28+
[<umin = 1024, udiv = 128>, <umin = 1024, udiv = 128>, <udiv = 64>] // Medium pingpong
29+
]>
30+
pdl.apply_native_rewrite "annotateOperation"(
31+
%matmul, %attr_name, %ranges
32+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
33+
}
34+
}
35+
36+
pdl.pattern @bf16_pingpong : benefit(1) {
37+
%imaps = pdl.attribute = [
38+
affine_map<(d0, d1, d2) -> (d0, d2)>,
39+
affine_map<(d0, d1, d2) -> (d1, d2)>,
40+
affine_map<(d0, d1, d2) -> (d0, d1)>
41+
]
42+
%elemtypes = pdl.attribute = [bf16, bf16, f32]
43+
%operands = pdl.operands
44+
%types = pdl.types
45+
%matmul = pdl.operation (%operands : !pdl.range<value>) -> (%types : !pdl.range<type>)
46+
pdl.apply_native_constraint "matchContraction"(
47+
%matmul, %elemtypes, %imaps
48+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
49+
50+
// Skip if the operation already has ranges.
51+
%attr_name = pdl.attribute = "iree_codegen.specialization_ranges"
52+
pdl.apply_native_constraint "hasAttr"(
53+
%matmul, %attr_name
54+
: !pdl.operation, !pdl.attribute) {isNegated = true}
55+
56+
pdl.rewrite %matmul {
57+
%ranges = pdl.attribute = #util<int.assumption.multi_array[
58+
[<umin = 2048, udiv = 256>, <umin = 2048, udiv = 256>, <udiv = 64>], // Large pingpong
59+
[<umin = 1024, udiv = 128>, <umin = 1024, udiv = 128>, <udiv = 64>] // Medium pingpong
60+
]>
61+
pdl.apply_native_rewrite "annotateOperation"(
62+
%matmul, %attr_name, %ranges
63+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
64+
}
65+
}
66+
67+
pdl.pattern @f8E4M3_pingpong : benefit(1) {
68+
%imaps = pdl.attribute = [
69+
affine_map<(d0, d1, d2) -> (d0, d2)>,
70+
affine_map<(d0, d1, d2) -> (d1, d2)>,
71+
affine_map<(d0, d1, d2) -> (d0, d1)>
72+
]
73+
%elemtypes = pdl.attribute = [f8E4M3FN, f8E4M3FN, f32]
74+
%operands = pdl.operands
75+
%types = pdl.types
76+
%matmul = pdl.operation (%operands : !pdl.range<value>) -> (%types : !pdl.range<type>)
77+
pdl.apply_native_constraint "matchContraction"(
78+
%matmul, %elemtypes, %imaps
79+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
80+
81+
// Skip if the operation already has ranges.
82+
%attr_name = pdl.attribute = "iree_codegen.specialization_ranges"
83+
pdl.apply_native_constraint "hasAttr"(
84+
%matmul, %attr_name
85+
: !pdl.operation, !pdl.attribute) {isNegated = true}
86+
87+
pdl.rewrite %matmul {
88+
%ranges = pdl.attribute = #util<int.assumption.multi_array[
89+
[<umin = 2048, udiv = 256>, <umin = 2048, udiv = 256>, <udiv = 128>], // Large pingpong
90+
[<umin = 1024, udiv = 128>, <umin = 1024, udiv = 128>, <udiv = 128>] // Medium pingpong
91+
]>
92+
pdl.apply_native_rewrite "annotateOperation"(
93+
%matmul, %attr_name, %ranges
94+
: !pdl.operation, !pdl.attribute, !pdl.attribute)
95+
}
96+
}

0 commit comments

Comments
 (0)