|
| 1 | +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 |
| 2 | +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. |
| 3 | +# See https://llvm.org/LICENSE.txt for license information. |
| 4 | +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 5 | +# |
| 6 | +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates |
| 7 | + |
| 8 | +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ |
| 9 | +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s |
| 10 | +# --debug-only is only registered in assertion-enabled builds; gate the test. |
| 11 | +# REQUIRES: asserts |
| 12 | +# derived from gemm_bf16_0 |
| 13 | + |
| 14 | +--- | |
| 15 | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { ; IR stub for the MIR function below: its scalar loop body does not mirror the vector MIR; the MIR memory operands refer to %ir.d.addr.07 |
| 16 | + ; CHECK-LABEL: gemm: |
| 17 | + ; CHECK: .p2align 4 |
| 18 | + ; CHECK-NEXT: // %bb.0: // %entry |
| 19 | + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm |
| 20 | + ; CHECK-NEXT: ge r1, r1, r0 |
| 21 | + ; CHECK-NEXT: jnz r1, #.LBB0_4 |
| 22 | + ; CHECK-NEXT: nop // Delay Slot 5 |
| 23 | + ; CHECK-NEXT: nop // Delay Slot 4 |
| 24 | + ; CHECK-NEXT: nop // Delay Slot 3 |
| 25 | + ; CHECK-NEXT: nop // Delay Slot 2 |
| 26 | + ; CHECK-NEXT: nop // Delay Slot 1 |
| 27 | + ; CHECK-NEXT: // %bb.1: // %for.body.preheader |
| 28 | + ; CHECK-NEXT: add.nc lc, r0, #-1 |
| 29 | + ; CHECK-NEXT: movxm ls, #.LBB0_2 |
| 30 | + ; CHECK-NEXT: movxm le, #.L_LEnd0 |
| 31 | + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; nopv |
| 32 | + ; CHECK-NEXT: vldb wl8, [p0], m4; nopa ; nops ; nopxm ; nopv |
| 33 | + ; CHECK-NEXT: vldb wh10, [p0, #32]; nopa ; nops ; nopxm ; nopv |
| 34 | + ; CHECK-NEXT: vldb wl10, [p0], m4; nopa ; nops ; nopxm ; nopv |
| 35 | + ; CHECK-NEXT: vldb wh1, [p0, #32]; nopa ; nops ; nopxm ; nopv |
| 36 | + ; CHECK-NEXT: vldb wl1, [p0], m4; nopa ; nops ; nopxm ; nopv |
| 37 | + ; CHECK-NEXT: nopa ; vldb wh3, [p0, #32]; nopx |
| 38 | + ; CHECK-NEXT: vldb.3d wl3, [p0], d1 |
| 39 | + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3 |
| 40 | + ; CHECK-NEXT: vlda wl7, [p4, #320] |
| 41 | + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 |
| 42 | + ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 |
| 43 | + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 |
| 44 | + ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 |
| 45 | + ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 |
| 46 | + ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 |
| 47 | + ; CHECK-NEXT: vshuffle x10, x4, x6, r25 |
| 48 | + ; CHECK-NEXT: vshuffle x1, x3, x5, r13 |
| 49 | + ; CHECK-NEXT: vshuffle x3, x3, x5, r24 |
| 50 | + ; CHECK-NEXT: mov r3, p0 |
| 51 | + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 |
| 52 | + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 |
| 53 | + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 |
| 54 | + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 |
| 55 | + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 |
| 56 | + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 |
| 57 | + ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 |
| 58 | + ; CHECK-NEXT: .p2align 4 |
| 59 | + ; CHECK-NEXT: .LBB0_2: // %for.body |
| 60 | + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
| 61 | + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; vmac.f bmh2, bmh2, x10, x9, r29 |
| 62 | + ; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopx ; vmac.f bml4, bml4, x8, x7, r29 |
| 63 | + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x7, r29 |
| 64 | + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 |
| 65 | + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x7, r29 |
| 66 | + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 |
| 67 | + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 |
| 68 | + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 |
| 69 | + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 |
| 70 | + ; CHECK-NEXT: vlda wl7, [p4, #320] |
| 71 | + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 |
| 72 | + ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 |
| 73 | + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 |
| 74 | + ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 |
| 75 | + ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 |
| 76 | + ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 |
| 77 | + ; CHECK-NEXT: vshuffle x10, x4, x6, r25 |
| 78 | + ; CHECK-NEXT: vshuffle x1, x3, x5, r13 |
| 79 | + ; CHECK-NEXT: vshuffle x3, x3, x5, r24 |
| 80 | + ; CHECK-NEXT: mov r3, p0 |
| 81 | + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 |
| 82 | + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 |
| 83 | + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 |
| 84 | + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 |
| 85 | + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 |
| 86 | + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 |
| 87 | + ; CHECK-NEXT: .L_LEnd0: |
| 88 | + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29 |
| 89 | + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup |
| 90 | + ; CHECK-NEXT: nopa ; nopb ; nopx ; vmac.f bmh2, bmh2, x10, x9, r29 |
| 91 | + ; CHECK-NEXT: vmac.f bml4, bml4, x8, x7, r29 |
| 92 | + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29 |
| 93 | + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29 |
| 94 | + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x7, r29 |
| 95 | + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 |
| 96 | + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 |
| 97 | + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 |
| 98 | + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 |
| 99 | + ; CHECK-NEXT: nop |
| 100 | + ; CHECK-NEXT: nop |
| 101 | + ; CHECK-NEXT: nop |
| 102 | + ; CHECK-NEXT: nop |
| 103 | + ; CHECK-NEXT: nop |
| 104 | + ; CHECK-NEXT: nop |
| 105 | + ; CHECK-NEXT: nop |
| 106 | + ; CHECK-NEXT: nop |
| 107 | + ; CHECK-NEXT: nop |
| 108 | + ; CHECK-NEXT: nop |
| 109 | + ; CHECK-NEXT: nop |
| 110 | + ; CHECK-NEXT: nop |
| 111 | + ; CHECK-NEXT: nop |
| 112 | + ; CHECK-NEXT: nop |
| 113 | + ; CHECK-NEXT: nop |
| 114 | + ; CHECK-NEXT: nop |
| 115 | + ; CHECK-NEXT: nop |
| 116 | + ; CHECK-NEXT: nop |
| 117 | + ; CHECK-NEXT: .p2align 4 |
| 118 | + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup |
| 119 | + ; CHECK-NEXT: nopa ; ret lr |
| 120 | + ; CHECK-NEXT: nop // Delay Slot 5 |
| 121 | + ; CHECK-NEXT: nop // Delay Slot 4 |
| 122 | + ; CHECK-NEXT: nop // Delay Slot 3 |
| 123 | + ; CHECK-NEXT: nop // Delay Slot 2 |
| 124 | + ; CHECK-NEXT: nop // Delay Slot 1 |
| 125 | + entry: |
| 126 | + %cmp5 = icmp sgt i32 %n, 0 |
| 127 | + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup |
| 128 | + |
| 129 | + for.body.preheader: ; preds = %entry |
| 130 | + call void @llvm.set.loop.iterations.i32(i32 %n) ; NOTE(review): presumably the IR counterpart of `$lc = ADD_NC $r0, 0` in MIR bb.1 - confirm |
| 131 | + br label %for.body |
| 132 | + |
| 133 | + for.cond.cleanup: ; preds = %for.body, %entry |
| 134 | + ret void |
| 135 | + |
| 136 | + for.body: ; preds = %for.body.preheader, %for.body |
| 137 | + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] |
| 138 | + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] |
| 139 | + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 |
| 140 | + %add = add nsw i32 %0, 1 |
| 141 | + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 |
| 142 | + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 |
| 143 | + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 |
| 144 | + %1 = call i1 @llvm.loop.decrement.i32(i32 1) |
| 145 | + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 ; !6 carries the mustprogress (!7) and itercount.range (!8) loop metadata |
| 146 | + } |
| 147 | + |
| 148 | + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn |
| 149 | + declare void @llvm.set.loop.iterations.i32(i32) #1 |
| 150 | + |
| 151 | + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn |
| 152 | + declare i1 @llvm.loop.decrement.i32(i32) #1 |
| 153 | + |
| 154 | + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } |
| 155 | + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } |
| 156 | + |
| 157 | + !llvm.module.flags = !{!0} |
| 158 | + !llvm.ident = !{!1} |
| 159 | + |
| 160 | + !0 = !{i32 1, !"wchar_size", i32 4} |
| 161 | + !1 = !{!"clang version 18.0.0git ([email protected]:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} |
| 162 | + !2 = !{!3, !3, i64 0} |
| 163 | + !3 = !{!"int", !4, i64 0} |
| 164 | + !4 = !{!"omnipotent char", !5, i64 0} |
| 165 | + !5 = !{!"Simple C/C++ TBAA"} |
| 166 | + !6 = distinct !{!6, !7, !8} ; loop metadata attached to the for.body back-edge branch |
| 167 | + !7 = !{!"llvm.loop.mustprogress"} |
| 168 | + !8 = !{!"llvm.loop.itercount.range", i64 10} ; NOTE(review): presumably a lower bound of 10 iterations - confirm against the AIE loop-metadata documentation |
| 169 | + |
| 170 | +... |
| 171 | +--- |
| 172 | +name: gemm |
| 173 | +alignment: 16 |
| 174 | +tracksRegLiveness: true |
| 175 | +body: | |
| 176 | + bb.0.entry (align 16): |
| 177 | + successors: %bb.1(0x50000000), %bb.3(0x30000000) |
| 178 | + liveins: $p0, $p1, $r0 |
| 179 | +
|
| 180 | + $r1 = MOV_RLC_imm10_pseudo 0 ; $r1 = 0 |
| 181 | + $r1 = GE $r1, $r0 ; compare 0 against $r0; the result overwrites $r1 |
| 182 | + JNZ $r1, %bb.3 ; branch to bb.3 (for.cond.cleanup), bypassing the loop |
| 183 | + DelayedSchedBarrier ; the CHECK output shows five delay-slot nops after the jnz |
| 184 | +
|
| 185 | + bb.1.for.body.preheader: |
| 186 | + successors: %bb.2(0x80000000) |
| 187 | + liveins: $p0, $p1, $r0 |
| 188 | +
|
| 189 | + $lc = ADD_NC $r0, 0 ; loop count taken from $r0; the CHECK output expects this immediate rewritten to #-1 |
| 190 | + $ls = MOVXM_lng_cg %bb.2 ; $ls receives the address of the loop block bb.2 |
| 191 | + $le = MOVXM_lng_cg <mcsymbol .L_LEnd0> ; $le receives .L_LEnd0, the symbol named by PseudoLoopEnd in bb.2 |
| 192 | +
|
| 193 | + bb.2.for.body (align 16): |
| 194 | + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 |
| 195 | +
|
| 196 | + $p7 = MOV_mv_scl $p5 ; keep the incoming $p5; it is copied into $p4 at the end of the body |
| 197 | + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) ; four wh/wl load pairs through $p0 follow |
| 198 | + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) ; also redefines $p0 (post-modify by $m4) |
| 199 | + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 200 | + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 201 | + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 202 | + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 203 | + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 204 | + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) ; 3D variant: redefines $dc1 and $dc5 in addition to $p0 |
| 205 | + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 ; $r3 is only recomputed at the end of the body, so these four shifts read the value carried in from the previous iteration |
| 206 | + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 |
| 207 | + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 |
| 208 | + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 |
| 209 | + $x8 = VSHUFFLE $x0, $x2, $r9 ; shuffles of the shifted vectors produce $x8, $x10, $x1, $x3 (first VMAC operand) and $x5 |
| 210 | + $x3 = VSHUFFLE $x4, $x6, $r9 |
| 211 | + $x5 = VSHUFFLE $x0, $x2, $r25 |
| 212 | + $x10 = VSHUFFLE $x4, $x6, $r25 |
| 213 | + $x1 = VSHUFFLE $x3, $x5, $r13 |
| 214 | + $x3 = VSHUFFLE $x3, $x5, $r24 |
| 215 | + $wh5 = VLD_idx_imm_3x32_pseudo $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 216 | + $wl5, $p5 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p5, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) ; also redefines $p5 (post-modify by 256) |
| 217 | + $wh7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) ; six loads through $p4 at offsets 320..480 |
| 218 | + $wl7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 320 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 219 | + $wh9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 416 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 220 | + $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 384 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 221 | + $wh11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 480 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 222 | + $wl11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 448 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) |
| 223 | + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask ; 16 VMACs follow, each reading and writing its own bm accumulator |
| 224 | + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 225 | + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 226 | + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 227 | + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 228 | + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 229 | + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 230 | + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 231 | + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 232 | + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 233 | + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 234 | + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 235 | + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 236 | + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 237 | + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 238 | + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask |
| 239 | + $r3 = MOV_mv_scl $p0 ; next three instructions: $r3 = ($p0 & $r0) + 34, consumed by the VSHIFT_ALIGNs of the next iteration |
| 240 | + $r3 = AND $r3, $r0 |
| 241 | + $r3 = nuw nsw ADD_add_r_ri $r3, 34, implicit-def $srcarry |
| 242 | + $p4 = MOV_mv_scl $p7 ; $p4 = the value $p5 had at the top of this iteration |
| 243 | +
|
| 244 | + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 |
| 245 | +
|
| 246 | + bb.3.for.cond.cleanup (align 16): ; target of the JNZ in bb.3's predecessor bb.0; the IR lists %for.body and %entry as predecessors |
| 247 | + RET implicit $lr |
| 248 | + DelayedSchedBarrier ; the CHECK output shows `ret lr` followed by five delay-slot nops |
| 249 | +
|
| 250 | +... |
0 commit comments