diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir new file mode 100644 index 000000000000..0f4b40ce222c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir @@ -0,0 +1,250 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; nopv + ; CHECK-NEXT: vldb wl8, [p0], m4; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh10, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl10, [p0], m4; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh1, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl1, [p0], m4; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb wh3, [p0, #32]; nopx + ; CHECK-NEXT: 
vldb.3d wl3, [p0], d1 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: vlda wl7, [p4, #320] + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24 + ; CHECK-NEXT: mov r3, p0 + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopx ; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vlda wl7, [p4, #320] + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: 
vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24 + ; CHECK-NEXT: mov r3, p0 + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopb ; nopx ; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + 
; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 + + $p7 = MOV_mv_scl $p5 + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 + $x8 = VSHUFFLE $x0, $x2, $r9 + $x3 = VSHUFFLE $x4, $x6, $r9 + $x5 = VSHUFFLE 
$x0, $x2, $r25 + $x10 = VSHUFFLE $x4, $x6, $r25 + $x1 = VSHUFFLE $x3, $x5, $r13 + $x3 = VSHUFFLE $x3, $x5, $r24 + $wh5 = VLD_idx_imm_3x32_pseudo $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p5 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p5, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 320 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 416 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 384 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 480 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 448 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x7, 
$r29, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $r3 = MOV_mv_scl $p0 + $r3 = AND $r3, $r0 + $r3 = nuw nsw ADD_add_r_ri $r3, 34, implicit-def $srcarry + $p4 = MOV_mv_scl $p7 + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir new file mode 100644 index 000000000000..346e56f42fc1 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir @@ -0,0 +1,245 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p5, p7; nopv + ; CHECK-NEXT: vldb wl8, [p0], m4; nopa ; nops ; nopx ; mov p4, p2; nopv + ; CHECK-NEXT: vldb wh10, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl10, [p0], m4; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh1, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl1, [p0], m4; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh3, [p0, #32] + ; CHECK-NEXT: vldb.3d wl3, [p0], d1 + ; CHECK-NEXT: vldb wh7, [p7, #32]; vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; mov r1, p0 + ; CHECK-NEXT: and r2, r1, r0; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x10, x4, x6, r25; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl11, 
[p4, #0]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24 + ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x7, r29 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; mov p5, p7; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; mov p4, p2; vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vldb wh7, [p7, #32]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; mov r1, p0 + ; CHECK-NEXT: and r2, r1, r0; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x10, x4, x6, r25; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24 + ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x7, r29 + ; CHECK-NEXT: 
vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr 
inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 + + $p5 = MOV_mv_scl $p7 + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 + $x8 = VSHUFFLE $x0, $x2, $r9 + $x3 = VSHUFFLE $x4, $x6, $r9 + $x5 = 
VSHUFFLE $x0, $x2, $r25 + $x10 = VSHUFFLE $x4, $x6, $r25 + $x1 = VSHUFFLE $x3, $x5, $r13 + $x3 = VSHUFFLE $x3, $x5, $r24 + $wh7 = VLD_idx_imm_3x32_pseudo $p7, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl7, $p7 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p7, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $p4 = MOV_mv_scl $p2 + $p4 = nuw PADD_imm9_pseudo $p4, 320 + $wh5 = VLDA_dmw_lda_w_ag_idx_imm $p2, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p4 = VLD_pstm_imm_4x32_pseudo $p4, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh9 = VLD_idx_imm_3x32_pseudo $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl9, $p4 = VLD_pstm_imm_4x32_pseudo $p4, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11 = VLD_idx_imm_3x32_pseudo $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11 = VLD_idx_imm_3x32_pseudo $p4, 0 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x5, $r29, implicit-def $srfpflags, 
implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $r1 = MOV_mv_scl $p0 + $r2 = AND $r1, $r0 + $r3 = nuw nsw ADD_add_r_ri $r2, 34, implicit-def $srcarry + $p2 = MOV_mv_scl $p5 + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir new file mode 100644 index 000000000000..b62ff74d0bd3 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir @@ -0,0 +1,218 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner,postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl1, [p0, #64]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh1, [p0, #96]; nopa ; padds [p0], m4; nopxm ; nopv + ; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], #128; nopxm ; nopv + ; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb wh2, [p0], #32; nopxm ; nops + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb wl9, [p1], m5 + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl10, [p1], m5 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16 + ; CHECK-NEXT: vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; 
nopx ; vshuffle x9, x9, x9, r6; nopv + ; CHECK-NEXT: vldb wh0, [p0, #32]; nopx ; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wl8, [p1], m5; vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; nopv + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; vshuffle x9, x9, x9, r6 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vmac.f bml0, bmh1, x6, 
x10, r2 + ; CHECK-NEXT: vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback 
noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg <mcsymbol .L_LEnd0> + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl0 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADDS_st_ptr_inc_idx $p0, $m4 + $p0 = nuw PADD_imm9_pseudo $p0, 128 + $wl2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06,
addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x4 = VSHUFFLE $x0, $x2, $r3 + $x5 = VSHUFFLE $x0, $x2, $r16 + $x6 = VSHUFFLE $x1, $x3, $r3 + $x7 = VSHUFFLE $x1, $x3, $r16 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl10, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh11, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x8 = VSHUFFLE $x8, $x8, $r6 + $x9 = VSHUFFLE $x9, $x9, $r6 + $x10 = VSHUFFLE $x10, $x10, $r6 + $x11 = VSHUFFLE $x11, $x11, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense 
$bmh3, $x7, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir new file mode 100644 index 000000000000..f79564877880 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir @@ -0,0 +1,217 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner,postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl1, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh1, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv + ; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb wh2, [p0], #32; nopxm ; nops + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb wl9, [p1], m5 + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl10, [p1], m5 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16 + ; CHECK-NEXT: vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopx ; 
vshuffle x9, x9, x9, r6; nopv + ; CHECK-NEXT: nopa ; vldb wh0, [p0], #32; nopx ; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wh1, [p0], #32; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; nopv + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; vshuffle x9, x9, x9, r6 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vmac.f bml0, bmh1, x6, x10, r2 + ; 
CHECK-NEXT: vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree 
nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg <mcsymbol .L_LEnd0> + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADDS_st_ptr_inc_idx $p0, $m4 + $wl2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 =
VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x4 = VSHUFFLE $x0, $x2, $r3 + $x5 = VSHUFFLE $x0, $x2, $r16 + $x6 = VSHUFFLE $x1, $x3, $r3 + $x7 = VSHUFFLE $x1, $x3, $r16 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl10, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh11, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x8 = VSHUFFLE $x8, $x8, $r6 + $x9 = VSHUFFLE $x9, $x9, $r6 + $x10 = VSHUFFLE $x10, $x10, $r6 + $x11 = VSHUFFLE $x11, $x11, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x9, $r2, 
implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir new file mode 100644 index 000000000000..ac136c57b989 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir @@ -0,0 +1,224 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl1, [p0, #64]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: paddb [p0], m4; vlda wh1, [p0, #96]; nops ; nopxm ; nopv + ; CHECK-NEXT: paddb [p0], #128; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl10, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb wh10, [p0], #32; nopx + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb wh3, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6; vshuffle x6, x8, x10, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; vshuffle x11, x8, x10, r16 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x0, x1, x3, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x9, x1, x3, r16 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1 + ; CHECK-NEXT: vshuffle x3, x3, x3, r6 + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb 
wl8, [p0, #0]; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx ; vmac.f bmh1, bmh1, x11, x3, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x0, x0, x0, r6 + ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4 + ; CHECK-NEXT: paddb [p0], #128; vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh5, bmh5, x11, x5, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh6, bmh6, x0, x5, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vldb wl3, [p1], m5; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vldb wh3, [p1], m6; vmac.f bml0, bml0, x11, x0, r2 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml1, bml1, x0, x0, r2 + ; CHECK-NEXT: vldb wh5, [p1], m6; vshuffle x6, x8, x10, r3; vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vldb wl0, [p1], m5; vshuffle x11, x8, x10, r16; vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x0, x1, x3, r3; vmac.f bml4, bml4, x11, x7, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bml6, bml6, x0, x7, r2 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: vshuffle x3, x3, x3, r6 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh1, bmh1, x11, x3, r2 + ; CHECK-NEXT: vshuffle x0, x0, x0, r6 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x11, x5, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh7, 
x9, x5, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vmac.f bml0, bml0, x11, x0, r2 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x0, r2 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vmac.f bml4, bml4, x11, x7, r2 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x7, r2 + ; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, 
inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg <mcsymbol .L_LEnd0> + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06,
addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x6 = VSHUFFLE $x8, $x10, $r3 + $x11 = VSHUFFLE $x8, $x10, $r16 + $x0 = VSHUFFLE $x1, $x3, $r3 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x3 = VSHUFFLE $x3, $x3, $r6 + $x5 = VSHUFFLE $x5, $x5, $r6 + $x7 = VSHUFFLE $x7, $x7, $r6 + $x0 = VSHUFFLE $x0, $x0, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x11, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x11, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, 
$x0, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x11, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x11, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir new file mode 100644 index 000000000000..84045383b00a --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir @@ -0,0 +1,231 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates
+
+# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s
+
+
+# derived from gemm_bf16_0; the CHECK lines inside @gemm pin the assembly llc emits for the MIR body of `gemm` further down
+
+--- |
+  define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
+  ; CHECK-LABEL: gemm:
+  ; CHECK:         .p2align 4
+  ; CHECK-NEXT:  // %bb.0: // %entry
+  ; CHECK-NEXT:    mova r1, #0; nopb ; nopxm
+  ; CHECK-NEXT:    ge r1, r1, r0
+  ; CHECK-NEXT:    jnz r1, #.LBB0_4
+  ; CHECK-NEXT:    nop // Delay Slot 5
+  ; CHECK-NEXT:    nop // Delay Slot 4
+  ; CHECK-NEXT:    nop // Delay Slot 3
+  ; CHECK-NEXT:    nop // Delay Slot 2
+  ; CHECK-NEXT:    nop // Delay Slot 1
+  ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+  ; CHECK-NEXT:    add.nc lc, r0, #-1
+  ; CHECK-NEXT:    movxm ls, #.LBB0_2
+  ; CHECK-NEXT:    movxm le, #.L_LEnd0
+  ; CHECK-NEXT:    vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv
+  ; CHECK-NEXT:    vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv
+  ; CHECK-NEXT:    vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv
+  ; CHECK-NEXT:    vldb wh6, [p0, #96]; nopa ; padds [p0], m4; nopxm ; nopv
+  ; CHECK-NEXT:    vldb wl9, [p1], m5; nopa ; padds [p0], #128; nopxm ; nopv
+  ; CHECK-NEXT:    vldb wl10, [p0], #32; nopa ; nops ; nopxm ; nopv
+  ; CHECK-NEXT:    vldb wh10, [p0], #32; nopx
+  ; CHECK-NEXT:    vldb wh9, [p1], m6
+  ; CHECK-NEXT:    vldb wl3, [p0], #32
+  ; CHECK-NEXT:    vldb.3d wh3, [p0], d0
+  ; CHECK-NEXT:    vldb wl5, [p1], m5
+  ; CHECK-NEXT:    vldb wh5, [p1], m6
+  ; CHECK-NEXT:    vldb wl7, [p1], m5
+  ; CHECK-NEXT:    vldb wh7, [p1], m6; vshuffle x1, x8, x10, r4
+  ; CHECK-NEXT:    vldb wl3, [p1], m5; vshuffle x11, x9, x9, r2
+  ; CHECK-NEXT:    vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
+  ; CHECK-NEXT:    vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3
+  ; CHECK-NEXT:    vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3
+  ; CHECK-NEXT:    vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x10, x11, r3
+  ; CHECK-NEXT:    vmac.f bmh3, bmh3, x6, x11, r3
+  ; CHECK-NEXT:    vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3
+  ; CHECK-NEXT:    vmac.f bmh5, bmh5, x8, x5, r3
+  ; CHECK-NEXT:    vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x10, x5, r3
+  ; CHECK-NEXT:    .p2align 4
+  ; CHECK-NEXT:  .LBB0_2: // %for.body
+  ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+  ; CHECK-NEXT:    vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
+  ; CHECK-NEXT:    vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3
+  ; CHECK-NEXT:    vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3
+  ; CHECK-NEXT:    padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3
+  ; CHECK-NEXT:    padds [p0], #128; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
+  ; CHECK-NEXT:    vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3
+  ; CHECK-NEXT:    vldb wh10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3
+  ; CHECK-NEXT:    vldb wh9, [p1], m6; vmac.f bml6, bml6, x10, x3, r3
+  ; CHECK-NEXT:    vldb wl3, [p0], #32; vmac.f bml4, bml4, x6, x3, r3
+  ; CHECK-NEXT:    vldb.3d wh3, [p0], d0
+  ; CHECK-NEXT:    vldb wl5, [p1], m5
+  ; CHECK-NEXT:    vldb wh5, [p1], m6
+  ; CHECK-NEXT:    vldb wl7, [p1], m5
+  ; CHECK-NEXT:    vldb wh7, [p1], m6; vshuffle x1, x8, x10, r4
+  ; CHECK-NEXT:    vldb wl3, [p1], m5; vshuffle x11, x9, x9, r2
+  ; CHECK-NEXT:    vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
+  ; CHECK-NEXT:    vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3
+  ; CHECK-NEXT:    vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3
+  ; CHECK-NEXT:    vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x10, x11, r3
+  ; CHECK-NEXT:    vmac.f bmh3, bmh3, x6, x11, r3
+  ; CHECK-NEXT:    vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3
+  ; CHECK-NEXT:    vmac.f bmh5, bmh5, x8, x5, r3
+  ; CHECK-NEXT:  .L_LEnd0:
+  ; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x10, x5, r3
+  ; CHECK-NEXT:  // %bb.3: // %for.cond.cleanup
+  ; CHECK-NEXT:    vmac.f bmh7, bmh7, x6, x5, r3
+  ; CHECK-NEXT:    vmac.f bmh8, bmh8, x1, x7, r3
+  ; CHECK-NEXT:    vmac.f bml0, bml0, x8, x7, r3
+  ; CHECK-NEXT:    vmac.f bml1, bml1, x10, x7, r3
+  ; CHECK-NEXT:    vmac.f bml2, bml2, x6, x7, r3
+  ; CHECK-NEXT:    vmac.f bml3, bml3, x1, x3, r3
+  ; CHECK-NEXT:    vmac.f bml5, bml5, x8, x3, r3
+  ; CHECK-NEXT:    vmac.f bml6, bml6, x10, x3, r3
+  ; CHECK-NEXT:    vmac.f bml4, bml4, x6, x3, r3
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    .p2align 4
+  ; CHECK-NEXT:  .LBB0_4: // %for.cond.cleanup
+  ; CHECK-NEXT:    nopa ; ret lr
+  ; CHECK-NEXT:    nop // Delay Slot 5
+  ; CHECK-NEXT:    nop // Delay Slot 4
+  ; CHECK-NEXT:    nop // Delay Slot 3
+  ; CHECK-NEXT:    nop // Delay Slot 2
+  ; CHECK-NEXT:    nop // Delay Slot 1
+  entry:
+    %cmp5 = icmp sgt i32 %n, 0
+    br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader: ; preds = %entry
+    call void @llvm.set.loop.iterations.i32(i32 %n)
+    br label %for.body
+
+  for.cond.cleanup: ; preds = %for.body, %entry
+    ret void
+
+  for.body: ; preds = %for.body.preheader, %for.body
+    %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
+    %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
+    %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2
+    %add = add nsw i32 %0, 1
+    store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2
+    %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1
+    %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1
+    %1 = call i1 @llvm.loop.decrement.i32(i32 1)
+    br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
+  }
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare void @llvm.set.loop.iterations.i32(i32) #1
+
+  ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
+  declare i1 @llvm.loop.decrement.i32(i32) #1
+
+  attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+  attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C/C++ TBAA"}
+  !6 = distinct !{!6, !7, !8}
+  !7 = !{!"llvm.loop.mustprogress"}
+  !8 = !{!"llvm.loop.itercount.range", i64 10}
+
+...
+---
+name: gemm
+alignment: 16
+tracksRegLiveness: true
+body: |
+  bb.0.entry (align 16):
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $p0, $p1, $r0
+
+    renamable $r1 = MOV_RLC_imm10_pseudo 0
+    renamable $r1 = GE killed renamable $r1, renamable $r0
+    JNZ killed renamable $r1, %bb.3
+    DelayedSchedBarrier
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+    liveins: $p0, $p1, $r0
+
+    $lc = ADD_NC $r0, 0
+    $ls = MOVXM_lng_cg %bb.2 ; loop start: the loop block bb.2
+    $le = MOVXM_lng_cg <mcsymbol .L_LEnd0> ; loop end: the .L_LEnd0 label the CHECK lines expect before the last loop bundle
+
+  bb.2.for.body (align 16):
+    liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13
+
+    $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $p0 = nuw PADD_mod_pseudo $p0, $m4
+    $p0 = PADD_imm9_pseudo $p0, 128
+    $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $x1 = VSHUFFLE $x8, $x10, $r4
+    $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $x11 = VSHUFFLE $x9, $x9, $r2
+    $x8 = VSHUFFLE $x8, $x10, $r16
+    $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $x10 = VSHUFFLE $x6, $x3, $r4
+    $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $x6 = VSHUFFLE $x6, $x3, $r16
+    $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $x5 = VSHUFFLE $x5, $x5, $r2
+    $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
+    $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $x7 = VSHUFFLE $x7, $x7, $r2
+    $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $x3 = VSHUFFLE $x3, $x3, $r2
+    $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
+    $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
+    PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 ; same symbol as loaded into $le in bb.1, followed by the loop block
+
+  bb.3.for.cond.cleanup (align 16):
+    RET implicit $lr
+    DelayedSchedBarrier
+
+...