Skip to content

Commit 1a01a80

Browse files
author
Martien de Jong
committed
[AIE] Add baseline tests for gemm_bf16 and conv2d_bf16
1 parent 638e872 commit 1a01a80

File tree

6 files changed

+1385
-0
lines changed

6 files changed

+1385
-0
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
3+
# See https://llvm.org/LICENSE.txt for license information.
4+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5+
#
6+
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
7+
8+
# llc only registers --debug-only in builds with assertions enabled, so this
# test is restricted to such builds. The debug summary is written to stderr
# and is not part of the stream that is piped into FileCheck.
# REQUIRES: asserts
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \
9+
# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s
10+
11+
12+
# derived from gemm_bf16_0
13+
14+
--- |
15+
; Scalar stand-in for the kernel loop: each iteration loads one i32 from %s,
; adds 1 and stores it to %d. The autogenerated assertions inside this function
; describe vector code (vldb/vshuffle/vmac.f), i.e. the output llc produces for
; the MIR function of the same name later in this file, whose memory operands
; refer to %ir.d.addr.07 defined here.
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
16+
; CHECK-LABEL: gemm:
17+
; CHECK: .p2align 4
18+
; CHECK-NEXT: // %bb.0: // %entry
19+
; CHECK-NEXT: mova r1, #0; nopb ; nopxm
20+
; CHECK-NEXT: ge r1, r1, r0
21+
; CHECK-NEXT: jnz r1, #.LBB0_4
22+
; CHECK-NEXT: nop // Delay Slot 5
23+
; CHECK-NEXT: nop // Delay Slot 4
24+
; CHECK-NEXT: nop // Delay Slot 3
25+
; CHECK-NEXT: nop // Delay Slot 2
26+
; CHECK-NEXT: nop // Delay Slot 1
27+
; CHECK-NEXT: // %bb.1: // %for.body.preheader
28+
; CHECK-NEXT: add.nc lc, r0, #-1
29+
; CHECK-NEXT: movxm ls, #.LBB0_2
30+
; CHECK-NEXT: movxm le, #.L_LEnd0
31+
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; nopv
32+
; CHECK-NEXT: vldb wl8, [p0], m4; nopa ; nops ; nopxm ; nopv
33+
; CHECK-NEXT: vldb wh10, [p0, #32]; nopa ; nops ; nopxm ; nopv
34+
; CHECK-NEXT: vldb wl10, [p0], m4; nopa ; nops ; nopxm ; nopv
35+
; CHECK-NEXT: vldb wh1, [p0, #32]; nopa ; nops ; nopxm ; nopv
36+
; CHECK-NEXT: vldb wl1, [p0], m4; nopa ; nops ; nopxm ; nopv
37+
; CHECK-NEXT: nopa ; vldb wh3, [p0, #32]; nopx
38+
; CHECK-NEXT: vldb.3d wl3, [p0], d1
39+
; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3
40+
; CHECK-NEXT: vlda wl7, [p4, #320]
41+
; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3
42+
; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9
43+
; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3
44+
; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25
45+
; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3
46+
; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9
47+
; CHECK-NEXT: vshuffle x10, x4, x6, r25
48+
; CHECK-NEXT: vshuffle x1, x3, x5, r13
49+
; CHECK-NEXT: vshuffle x3, x3, x5, r24
50+
; CHECK-NEXT: mov r3, p0
51+
; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29
52+
; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29
53+
; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29
54+
; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29
55+
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29
56+
; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29
57+
; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29
58+
; CHECK-NEXT: .p2align 4
59+
; CHECK-NEXT: .LBB0_2: // %for.body
60+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
61+
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; vmac.f bmh2, bmh2, x10, x9, r29
62+
; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopx ; vmac.f bml4, bml4, x8, x7, r29
63+
; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x7, r29
64+
; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x7, r29
65+
; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x7, r29
66+
; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29
67+
; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29
68+
; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29
69+
; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29
70+
; CHECK-NEXT: vlda wl7, [p4, #320]
71+
; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3
72+
; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9
73+
; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3
74+
; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25
75+
; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3
76+
; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9
77+
; CHECK-NEXT: vshuffle x10, x4, x6, r25
78+
; CHECK-NEXT: vshuffle x1, x3, x5, r13
79+
; CHECK-NEXT: vshuffle x3, x3, x5, r24
80+
; CHECK-NEXT: mov r3, p0
81+
; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29
82+
; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29
83+
; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29
84+
; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29
85+
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29
86+
; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29
87+
; CHECK-NEXT: .L_LEnd0:
88+
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29
89+
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
90+
; CHECK-NEXT: nopa ; nopb ; nopx ; vmac.f bmh2, bmh2, x10, x9, r29
91+
; CHECK-NEXT: vmac.f bml4, bml4, x8, x7, r29
92+
; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29
93+
; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29
94+
; CHECK-NEXT: vmac.f bml5, bml5, x10, x7, r29
95+
; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29
96+
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29
97+
; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29
98+
; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29
99+
; CHECK-NEXT: nop
100+
; CHECK-NEXT: nop
101+
; CHECK-NEXT: nop
102+
; CHECK-NEXT: nop
103+
; CHECK-NEXT: nop
104+
; CHECK-NEXT: nop
105+
; CHECK-NEXT: nop
106+
; CHECK-NEXT: nop
107+
; CHECK-NEXT: nop
108+
; CHECK-NEXT: nop
109+
; CHECK-NEXT: nop
110+
; CHECK-NEXT: nop
111+
; CHECK-NEXT: nop
112+
; CHECK-NEXT: nop
113+
; CHECK-NEXT: nop
114+
; CHECK-NEXT: nop
115+
; CHECK-NEXT: nop
116+
; CHECK-NEXT: nop
117+
; CHECK-NEXT: .p2align 4
118+
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
119+
; CHECK-NEXT: nopa ; ret lr
120+
; CHECK-NEXT: nop // Delay Slot 5
121+
; CHECK-NEXT: nop // Delay Slot 4
122+
; CHECK-NEXT: nop // Delay Slot 3
123+
; CHECK-NEXT: nop // Delay Slot 2
124+
; CHECK-NEXT: nop // Delay Slot 1
125+
; Guard: the loop is entered only when %n > 0; otherwise return immediately.
entry:
126+
%cmp5 = icmp sgt i32 %n, 0
127+
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
128+
129+
for.body.preheader: ; preds = %entry
130+
; Hand the trip count %n to the counted-loop intrinsic before entering the loop.
call void @llvm.set.loop.iterations.i32(i32 %n)
131+
br label %for.body
132+
133+
for.cond.cleanup: ; preds = %for.body, %entry
134+
ret void
135+
136+
for.body: ; preds = %for.body.preheader, %for.body
137+
; %d.addr.07 / %s.addr.06 are the running destination and source pointers.
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
138+
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
139+
; *d = *s + 1
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2
140+
%add = add nsw i32 %0, 1
141+
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2
142+
; Advance both pointers by one i32 element.
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1
143+
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1
144+
; Count one iteration down; the loop continues while the result is true.
%1 = call i1 @llvm.loop.decrement.i32(i32 1)
145+
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
146+
}
147+
148+
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
149+
declare void @llvm.set.loop.iterations.i32(i32) #1
150+
151+
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
152+
declare i1 @llvm.loop.decrement.i32(i32) #1
153+
154+
attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
155+
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }
156+
157+
!llvm.module.flags = !{!0}
158+
!llvm.ident = !{!1}
159+
160+
!0 = !{i32 1, !"wchar_size", i32 4}
161+
!1 = !{!"clang version 18.0.0git ([email protected]:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
162+
!2 = !{!3, !3, i64 0}
163+
!3 = !{!"int", !4, i64 0}
164+
!4 = !{!"omnipotent char", !5, i64 0}
165+
!5 = !{!"Simple C/C++ TBAA"}
166+
!6 = distinct !{!6, !7, !8}
167+
!7 = !{!"llvm.loop.mustprogress"}
168+
!8 = !{!"llvm.loop.itercount.range", i64 10}
169+
170+
...
171+
---
172+
name: gemm
173+
alignment: 16
174+
tracksRegLiveness: true
175+
body: |
176+
; Entry block: zero-trip guard in front of the loop. $r1 is set to 0, then
; overwritten with the result of GE $r1, $r0, and a non-zero result branches
; straight to bb.3, bypassing the loop.
; NOTE(review): presumably $r0 carries %n and this mirrors the IR guard
; `icmp sgt i32 %n, 0` -- confirm against the AIE2 calling convention.
bb.0.entry (align 16):
177+
successors: %bb.1(0x50000000), %bb.3(0x30000000)
178+
liveins: $p0, $p1, $r0
179+
180+
$r1 = MOV_RLC_imm10_pseudo 0
181+
$r1 = GE $r1, $r0
182+
JNZ $r1, %bb.3
183+
DelayedSchedBarrier
184+
185+
; Preheader: initialises $lc from $r0 (plus 0), $ls with the address of bb.2
; and $le with the .L_LEnd0 symbol that PseudoLoopEnd in bb.2 also names.
; NOTE(review): lc/ls/le look like the loop count/start/end registers of a
; zero-overhead loop -- confirm. The expected output has `add.nc lc, r0, #-1`
; and a preheader filled with loads/vmacs, so llc is expected to lower the
; count by one while moving part of the loop body in here.
bb.1.for.body.preheader:
186+
successors: %bb.2(0x80000000)
187+
liveins: $p0, $p1, $r0
188+
189+
$lc = ADD_NC $r0, 0
190+
$ls = MOVXM_lng_cg %bb.2
191+
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0>
192+
193+
; Loop body (single block, loops back to itself through PseudoLoopEnd). This
; is the instruction sequence whose final schedule the expected output pins
; down. In input order: loads through $p0, four VSHIFT_ALIGNs, six VSHUFFLEs,
; loads through $p5/$p4, sixteen VMAC_F accumulations, then scalar updates of
; $r3 and $p4 for the next iteration.
; NOTE(review): wlN/whN appear to be the low/high halves of xN -- confirm.
bb.2.for.body (align 16):
194+
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870
195+
196+
; Save the incoming $p5; it is post-incremented further down and the saved
; value is copied into $p4 at the bottom of the block.
$p7 = MOV_mv_scl $p5
197+
; Eight loads through $p0 into wh/wl 8, 10, 1 and 3. $p0 is post-modified by
; $m4 after the first three pairs and by the $d1_3d descriptor on the last
; load, which also redefines $dc1 and $dc5.
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
198+
$wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
199+
$wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
200+
$wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
201+
$wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
202+
$wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
203+
$wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
204+
$wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
205+
; x0/x2/x4/x6 are carried across iterations: each is updated from its old
; value, $s0, one of x8/x10/x1/x3 and $r3. $r3 is only recomputed at the
; bottom of the block, so the value used here comes from the previous
; iteration (or from outside the loop on the first pass).
$x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3
206+
$x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3
207+
$x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3
208+
$x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3
209+
; Two shuffle stages (selectors $r9/$r25, then $r13/$r24) produce x8, x1, x3
; and x10, the first vector operands of the VMAC_F group below. $x5 is only
; an intermediate here; wh5/wl5 are loaded afterwards.
$x8 = VSHUFFLE $x0, $x2, $r9
210+
$x3 = VSHUFFLE $x4, $x6, $r9
211+
$x5 = VSHUFFLE $x0, $x2, $r25
212+
$x10 = VSHUFFLE $x4, $x6, $r25
213+
$x1 = VSHUFFLE $x3, $x5, $r13
214+
$x3 = VSHUFFLE $x3, $x5, $r24
215+
; Second operand set: wh5/wl5 through $p5 (post-incremented by 256) and
; wh/wl 7, 9, 11 at fixed offsets 320..480 from $p4.
$wh5 = VLD_idx_imm_3x32_pseudo $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
216+
$wl5, $p5 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p5, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
217+
$wh7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
218+
$wl7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 320 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
219+
$wh9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 416 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
220+
$wl9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 384 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
221+
$wh11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 480 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
222+
$wl11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 448 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
223+
; Sixteen accumulations: every combination of {x8, x1, x3, x10} with
; {x5, x9, x7, x11}, each into its own bm register, all configured by $r29.
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask
224+
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask
225+
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask
226+
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask
227+
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask
228+
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask
229+
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask
230+
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask
231+
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask
232+
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask
233+
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask
234+
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask
235+
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask
236+
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask
237+
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask
238+
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask
239+
; Recompute $r3 from the updated $p0 (masked with $r0, plus 34) for the next
; iteration's VSHIFT_ALIGNs.
$r3 = MOV_mv_scl $p0
240+
$r3 = AND $r3, $r0
241+
$r3 = nuw nsw ADD_add_r_ri $r3, 34, implicit-def $srcarry
242+
; $p4 for the next iteration is this iteration's pre-increment $p5.
$p4 = MOV_mv_scl $p7
243+
244+
; Back edge to bb.2; .L_LEnd0 is the symbol the preheader wrote into $le.
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2
245+
246+
; Exit block: reached from the entry guard's branch and from the loop's
; fall-through. It only returns through $lr; the expected output follows
; `ret lr` with five delay-slot nops.
; NOTE(review): DelayedSchedBarrier presumably keeps the scheduler from moving
; instructions across the delayed branch -- confirm.
bb.3.for.cond.cleanup (align 16):
247+
RET implicit $lr
248+
DelayedSchedBarrier
249+
250+
...

0 commit comments

Comments
 (0)