-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AIE] Mostly refactoring of postpipeliner
SlotCount class for use in resource computations; PostPipelinerStrategy interface class
Martien de Jong
committed
Nov 18, 2024
1 parent
b4ddb6e
commit 8f03d3e
Showing
14 changed files
with
1,460 additions
and
189 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
234 changes: 234 additions & 0 deletions
234
llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | ||
# This file is licensed under the Apache License v2.0 with LLVM Exceptions. | ||
# See https://llvm.org/LICENSE.txt for license information. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
# | ||
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates | ||
|
||
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s | ||
|
||
|
||
# derived from gemm_bf16_0 | ||
|
||
--- | | ||
; NOTE(review): the IR body below is a scalar i32 load/add/store loop; it does not | ||
; model the vector gemm kernel found in the MIR body of this file. It looks like a | ||
; placeholder that provides the function signature and the %ir.s.addr.06 value | ||
; named by the MIR memory operands -- confirm before relying on it. | ||
; The CHECK lines are autogenerated (see the NOTE on the first line of this file): | ||
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing. | ||
; Expected shape: a copy of the leading loop instructions precedes .LBB0_2, the | ||
; trailing vmac.f group follows .L_LEnd0, and lc is computed as r0 + #-1 (the MIR | ||
; input uses ADD_NC $r0, 0) -- consistent with a software-pipelined loop. | ||
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { | ||
; CHECK-LABEL: gemm: | ||
; CHECK: .p2align 4 | ||
; CHECK-NEXT: // %bb.0: // %entry | ||
; CHECK-NEXT: mova r1, #0; nopb ; nopxm | ||
; CHECK-NEXT: ge r1, r1, r0 | ||
; CHECK-NEXT: jnz r1, #.LBB0_4 | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
; CHECK-NEXT: // %bb.1: // %for.body.preheader | ||
; CHECK-NEXT: add.nc lc, r0, #-1 | ||
; CHECK-NEXT: movxm ls, #.LBB0_2 | ||
; CHECK-NEXT: movxm le, #.L_LEnd0 | ||
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl10, [p0], #32; nopx | ||
; CHECK-NEXT: vldb wh10, [p0], #32 | ||
; CHECK-NEXT: vldb wl5, [p1], m5 | ||
; CHECK-NEXT: vldb wh5, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 | ||
; CHECK-NEXT: vldb wh7, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 | ||
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16 | ||
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 | ||
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3 | ||
; CHECK-NEXT: vshuffle x6, x6, x3, r16 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 | ||
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_2: // %for.body | ||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 | ||
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 | ||
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3 | ||
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 | ||
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 | ||
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 | ||
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3 | ||
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 | ||
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 | ||
; CHECK-NEXT: vldb wh5, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 | ||
; CHECK-NEXT: vldb wh7, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 | ||
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16 | ||
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 | ||
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3 | ||
; CHECK-NEXT: vshuffle x6, x6, x3, r16 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 | ||
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 | ||
; CHECK-NEXT: .L_LEnd0: | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3 | ||
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 | ||
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 | ||
; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 | ||
; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 | ||
; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 | ||
; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 | ||
; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 | ||
; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 | ||
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 | ||
; CHECK-NEXT: nopx | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup | ||
; CHECK-NEXT: nopa ; ret lr | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
entry: | ||
%cmp5 = icmp sgt i32 %n, 0 | ||
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup | ||
|
||
for.body.preheader: ; preds = %entry | ||
call void @llvm.set.loop.iterations.i32(i32 %n) | ||
br label %for.body | ||
|
||
for.cond.cleanup: ; preds = %for.body, %entry | ||
ret void | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] | ||
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] | ||
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 | ||
%add = add nsw i32 %0, 1 | ||
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 | ||
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 | ||
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 | ||
%1 = call i1 @llvm.loop.decrement.i32(i32 1) | ||
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 | ||
} | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare void @llvm.set.loop.iterations.i32(i32) #1 | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare i1 @llvm.loop.decrement.i32(i32) #1 | ||
|
||
attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } | ||
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } | ||
|
||
!llvm.module.flags = !{!0} | ||
!llvm.ident = !{!1} | ||
|
||
!0 = !{i32 1, !"wchar_size", i32 4} | ||
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} | ||
!2 = !{!3, !3, i64 0} | ||
!3 = !{!"int", !4, i64 0} | ||
!4 = !{!"omnipotent char", !5, i64 0} | ||
!5 = !{!"Simple C/C++ TBAA"} | ||
!6 = distinct !{!6, !7, !8} | ||
!7 = !{!"llvm.loop.mustprogress"} | ||
!8 = !{!"llvm.loop.itercount.range", i64 10} | ||
|
||
... | ||
--- | ||
# Scheduler input for this test (the RUN line starts llc before postmisched). | ||
#   bb.0: GE-compares 0 against $r0 and JNZ-branches to bb.3 on the result. | ||
#   bb.1: loop setup -- writes $lc from $r0, $ls from the address of bb.2 and | ||
#         $le from the .L_LEnd0 symbol (presumably loop count / start / end | ||
#         registers; confirm against the AIE2 ISA). | ||
#   bb.2: single-block loop closed by PseudoLoopEnd: 16 vector loads through | ||
#         $p0/$p1, 8 VSHUFFLEs and 16 VMAC_F updates of bmh0-bmh8 / bml0-bml6. | ||
#         Every load names the same IR value (%ir.s.addr.06) in its memory operand. | ||
#   bb.3: return. | ||
name: gemm | ||
alignment: 16 | ||
tracksRegLiveness: true | ||
body: | | ||
bb.0.entry (align 16): | ||
successors: %bb.1(0x50000000), %bb.3(0x30000000) | ||
liveins: $p0, $p1, $r0 | ||
renamable $r1 = MOV_RLC_imm10_pseudo 0 | ||
renamable $r1 = GE killed renamable $r1, renamable $r0 | ||
JNZ killed renamable $r1, %bb.3 | ||
DelayedSchedBarrier | ||
bb.1.for.body.preheader: | ||
successors: %bb.2(0x80000000) | ||
liveins: $p0, $p1, $r0 | ||
$lc = ADD_NC $r0, 0 | ||
$ls = MOVXM_lng_cg %bb.2 | ||
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0> | ||
bb.2.for.body (align 16): | ||
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 | ||
$wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$p0 = nuw PADD_mod_pseudo $p0, $m4 | ||
$p0 = PADD_imm9_pseudo $p0, 128 | ||
$wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x1 = VSHUFFLE $x8, $x10, $r4 | ||
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x11 = VSHUFFLE $x9, $x9, $r2 | ||
$x8 = VSHUFFLE $x8, $x10, $r16 | ||
$wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x0 = VSHUFFLE $x6, $x3, $r4 | ||
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x6 = VSHUFFLE $x6, $x3, $r16 | ||
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x5 = VSHUFFLE $x5, $x5, $r2 | ||
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x7 = VSHUFFLE $x7, $x7, $r2 | ||
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x3 = VSHUFFLE $x3, $x3, $r2 | ||
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 | ||
bb.3.for.cond.cleanup (align 16): | ||
RET implicit $lr | ||
DelayedSchedBarrier | ||
... |
234 changes: 234 additions & 0 deletions
234
llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | ||
# This file is licensed under the Apache License v2.0 with LLVM Exceptions. | ||
# See https://llvm.org/LICENSE.txt for license information. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
# | ||
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates | ||
|
||
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s | ||
|
||
|
||
# derived from gemm_bf16_0 | ||
|
||
--- | | ||
; NOTE(review): the IR body below is a scalar i32 load/add/store loop; it does not | ||
; model the vector gemm kernel found in the MIR body of this file. It looks like a | ||
; placeholder that provides the function signature and the %ir.s.addr.06 value | ||
; named by the MIR memory operands -- confirm before relying on it. | ||
; The CHECK lines are autogenerated (see the NOTE on the first line of this file): | ||
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing. | ||
; Expected shape: a copy of the leading loop instructions precedes .LBB0_2, the | ||
; trailing vmac.f group follows .L_LEnd0, and lc is computed as r0 + #-1 (the MIR | ||
; input uses ADD_NC $r0, 0) -- consistent with a software-pipelined loop. | ||
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { | ||
; CHECK-LABEL: gemm: | ||
; CHECK: .p2align 4 | ||
; CHECK-NEXT: // %bb.0: // %entry | ||
; CHECK-NEXT: mova r1, #0; nopb ; nopxm | ||
; CHECK-NEXT: ge r1, r1, r0 | ||
; CHECK-NEXT: jnz r1, #.LBB0_4 | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
; CHECK-NEXT: // %bb.1: // %for.body.preheader | ||
; CHECK-NEXT: add.nc lc, r0, #-1 | ||
; CHECK-NEXT: movxm ls, #.LBB0_2 | ||
; CHECK-NEXT: movxm le, #.L_LEnd0 | ||
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl10, [p0], #32; nopx | ||
; CHECK-NEXT: vldb wh10, [p0], #32 | ||
; CHECK-NEXT: vldb wl5, [p1], m5 | ||
; CHECK-NEXT: vldb wh5, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 | ||
; CHECK-NEXT: vldb wh7, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 | ||
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x0, x8, x10, r16 | ||
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x0, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 | ||
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x0, x5, r3 | ||
; CHECK-NEXT: vshuffle x6, x6, x3, r16 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3 | ||
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_2: // %for.body | ||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 | ||
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 | ||
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x0, x7, r3 | ||
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 | ||
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 | ||
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 | ||
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x0, x3, r3 | ||
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 | ||
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 | ||
; CHECK-NEXT: vldb wh5, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 | ||
; CHECK-NEXT: vldb wh7, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 | ||
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x0, x8, x10, r16 | ||
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x0, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 | ||
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x0, x5, r3 | ||
; CHECK-NEXT: vshuffle x6, x6, x3, r16 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3 | ||
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 | ||
; CHECK-NEXT: .L_LEnd0: | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3 | ||
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 | ||
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 | ||
; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3 | ||
; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 | ||
; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 | ||
; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 | ||
; CHECK-NEXT: vmac.f bml5, bml5, x0, x3, r3 | ||
; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 | ||
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 | ||
; CHECK-NEXT: nopx | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup | ||
; CHECK-NEXT: nopa ; ret lr | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
entry: | ||
%cmp5 = icmp sgt i32 %n, 0 | ||
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup | ||
|
||
for.body.preheader: ; preds = %entry | ||
call void @llvm.set.loop.iterations.i32(i32 %n) | ||
br label %for.body | ||
|
||
for.cond.cleanup: ; preds = %for.body, %entry | ||
ret void | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] | ||
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] | ||
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 | ||
%add = add nsw i32 %0, 1 | ||
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 | ||
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 | ||
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 | ||
%1 = call i1 @llvm.loop.decrement.i32(i32 1) | ||
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 | ||
} | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare void @llvm.set.loop.iterations.i32(i32) #1 | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare i1 @llvm.loop.decrement.i32(i32) #1 | ||
|
||
attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } | ||
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } | ||
|
||
!llvm.module.flags = !{!0} | ||
!llvm.ident = !{!1} | ||
|
||
!0 = !{i32 1, !"wchar_size", i32 4} | ||
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} | ||
!2 = !{!3, !3, i64 0} | ||
!3 = !{!"int", !4, i64 0} | ||
!4 = !{!"omnipotent char", !5, i64 0} | ||
!5 = !{!"Simple C/C++ TBAA"} | ||
!6 = distinct !{!6, !7, !8} | ||
!7 = !{!"llvm.loop.mustprogress"} | ||
!8 = !{!"llvm.loop.itercount.range", i64 10} | ||
|
||
... | ||
--- | ||
# Scheduler input for this test (the RUN line starts llc before postmisched). | ||
#   bb.0: GE-compares 0 against $r0 and JNZ-branches to bb.3 on the result. | ||
#   bb.1: loop setup -- writes $lc from $r0, $ls from the address of bb.2 and | ||
#         $le from the .L_LEnd0 symbol (presumably loop count / start / end | ||
#         registers; confirm against the AIE2 ISA). | ||
#   bb.2: single-block loop closed by PseudoLoopEnd: 16 vector loads through | ||
#         $p0/$p1, 8 VSHUFFLEs and 16 VMAC_F updates of bmh0-bmh8 / bml0-bml6. | ||
#         Every load names the same IR value (%ir.s.addr.06) in its memory operand. | ||
#   bb.3: return. | ||
# As far as the two bodies show, this differs from gemm-1.mir only in register | ||
# assignment: the $r16 shuffle of $x8/$x10 writes $x0 (there: $x8) and the $r4 | ||
# shuffle of $x6/$x3 writes $x10 (there: $x0); the VMAC_F operands follow suit. | ||
name: gemm | ||
alignment: 16 | ||
tracksRegLiveness: true | ||
body: | | ||
bb.0.entry (align 16): | ||
successors: %bb.1(0x50000000), %bb.3(0x30000000) | ||
liveins: $p0, $p1, $r0 | ||
renamable $r1 = MOV_RLC_imm10_pseudo 0 | ||
renamable $r1 = GE killed renamable $r1, renamable $r0 | ||
JNZ killed renamable $r1, %bb.3 | ||
DelayedSchedBarrier | ||
bb.1.for.body.preheader: | ||
successors: %bb.2(0x80000000) | ||
liveins: $p0, $p1, $r0 | ||
$lc = ADD_NC $r0, 0 | ||
$ls = MOVXM_lng_cg %bb.2 | ||
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0> | ||
bb.2.for.body (align 16): | ||
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 | ||
$wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$p0 = nuw PADD_mod_pseudo $p0, $m4 | ||
$p0 = PADD_imm9_pseudo $p0, 128 | ||
$wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x1 = VSHUFFLE $x8, $x10, $r4 | ||
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x11 = VSHUFFLE $x9, $x9, $r2 | ||
$x0 = VSHUFFLE $x8, $x10, $r16 | ||
$wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x10 = VSHUFFLE $x6, $x3, $r4 | ||
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x6 = VSHUFFLE $x6, $x3, $r16 | ||
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x5 = VSHUFFLE $x5, $x5, $r2 | ||
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x7 = VSHUFFLE $x7, $x7, $r2 | ||
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x3 = VSHUFFLE $x3, $x3, $r2 | ||
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 | ||
bb.3.for.cond.cleanup (align 16): | ||
RET implicit $lr | ||
DelayedSchedBarrier | ||
... |
234 changes: 234 additions & 0 deletions
234
llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | ||
# This file is licensed under the Apache License v2.0 with LLVM Exceptions. | ||
# See https://llvm.org/LICENSE.txt for license information. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
# | ||
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates | ||
|
||
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s | ||
|
||
|
||
# derived from gemm_bf16_0 | ||
|
||
--- | | ||
; Placeholder IR for the machine function 'gemm' in the MIR document below. | ||
; Its scalar load/add/store loop is not the vector kernel being scheduled; it | ||
; provides the block names and the %ir.s.addr.06 value that the MIR memory | ||
; operands refer to, plus the loop metadata !6 on the back-edge branch. | ||
; The assertion lines inside the function are autogenerated (see the note at | ||
; the top of the file): regenerate them with utils/update_llc_test_checks.py | ||
; instead of editing them by hand. | ||
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { | ||
; CHECK-LABEL: gemm: | ||
; CHECK: .p2align 4 | ||
; CHECK-NEXT: // %bb.0: // %entry | ||
; CHECK-NEXT: mova r1, #0; nopb ; nopxm | ||
; CHECK-NEXT: ge r1, r1, r0 | ||
; CHECK-NEXT: jnz r1, #.LBB0_4 | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
; CHECK-NEXT: // %bb.1: // %for.body.preheader | ||
; CHECK-NEXT: add.nc lc, r0, #-1 | ||
; CHECK-NEXT: movxm ls, #.LBB0_2 | ||
; CHECK-NEXT: movxm le, #.L_LEnd0 | ||
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl10, [p0], #32; nopx | ||
; CHECK-NEXT: vldb wh10, [p0], #32 | ||
; CHECK-NEXT: vldb wl5, [p1], m5 | ||
; CHECK-NEXT: vldb wh5, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 | ||
; CHECK-NEXT: vldb wh7, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 | ||
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x2, x2, x10, r16 | ||
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 | ||
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 | ||
; CHECK-NEXT: vshuffle x6, x6, x3, r16 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 | ||
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_2: // %for.body | ||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 | ||
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 | ||
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x2, x7, r3 | ||
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 | ||
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 | ||
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 | ||
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x2, x3, r3 | ||
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 | ||
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 | ||
; CHECK-NEXT: vldb wh5, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 | ||
; CHECK-NEXT: vldb wh7, [p1], m6 | ||
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 | ||
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x2, x2, x10, r16 | ||
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3 | ||
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 | ||
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 | ||
; CHECK-NEXT: vshuffle x6, x6, x3, r16 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 | ||
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 | ||
; CHECK-NEXT: .L_LEnd0: | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3 | ||
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 | ||
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 | ||
; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3 | ||
; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 | ||
; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 | ||
; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 | ||
; CHECK-NEXT: vmac.f bml5, bml5, x2, x3, r3 | ||
; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 | ||
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 | ||
; CHECK-NEXT: nopx | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup | ||
; CHECK-NEXT: nopa ; ret lr | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
entry: | ||
%cmp5 = icmp sgt i32 %n, 0 | ||
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup | ||
|
||
for.body.preheader: ; preds = %entry | ||
call void @llvm.set.loop.iterations.i32(i32 %n) | ||
br label %for.body | ||
|
||
for.cond.cleanup: ; preds = %for.body, %entry | ||
ret void | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] | ||
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] | ||
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 | ||
%add = add nsw i32 %0, 1 | ||
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 | ||
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 | ||
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 | ||
%1 = call i1 @llvm.loop.decrement.i32(i32 1) | ||
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 | ||
} | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare void @llvm.set.loop.iterations.i32(i32) #1 | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare i1 @llvm.loop.decrement.i32(i32) #1 | ||
|
||
attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } | ||
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } | ||
|
||
!llvm.module.flags = !{!0} | ||
!llvm.ident = !{!1} | ||
|
||
!0 = !{i32 1, !"wchar_size", i32 4} | ||
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} | ||
!2 = !{!3, !3, i64 0} | ||
!3 = !{!"int", !4, i64 0} | ||
!4 = !{!"omnipotent char", !5, i64 0} | ||
!5 = !{!"Simple C/C++ TBAA"} | ||
!6 = distinct !{!6, !7, !8} | ||
!7 = !{!"llvm.loop.mustprogress"} | ||
!8 = !{!"llvm.loop.itercount.range", i64 10} | ||
|
||
... | ||
--- | ||
# Machine function under test. llc enters the pipeline at 'postmisched' (see the | ||
# run line at the top of the file); all register operands below are physical. | ||
name: gemm | ||
alignment: 16 | ||
tracksRegLiveness: true | ||
body: | | ||
; Loop guard: $r1 = GE(0, $r0), then JNZ to the return block bb.3. This mirrors | ||
; the `icmp sgt %n, 0` test in the placeholder IR (loop skipped otherwise). | ||
bb.0.entry (align 16): | ||
successors: %bb.1(0x50000000), %bb.3(0x30000000) | ||
liveins: $p0, $p1, $r0 | ||
renamable $r1 = MOV_RLC_imm10_pseudo 0 | ||
renamable $r1 = GE killed renamable $r1, renamable $r0 | ||
JNZ killed renamable $r1, %bb.3 | ||
DelayedSchedBarrier | ||
; Loop register setup -- presumably $lc/$ls/$le are loop count/start/end | ||
; (TODO confirm against the AIE2 ISA). The input count is $r0 + 0, while the | ||
; expected assembly shows `add.nc lc, r0, #-1`, with part of the loop body also | ||
; emitted before .LBB0_2 and after .L_LEnd0. | ||
bb.1.for.body.preheader: | ||
successors: %bb.2(0x80000000) | ||
liveins: $p0, $p1, $r0 | ||
$lc = ADD_NC $r0, 0 | ||
$ls = MOVXM_lng_cg %bb.2 | ||
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0> | ||
; Loop body handed to the scheduler: 16 vector loads (8 addressed via $p0, 8 via | ||
; $p1), 2 pointer adds on $p0, 8 VSHUFFLEs and 16 VMAC_F updates, one per | ||
; accumulator $bmh0-$bmh8 and $bml0-$bml6. | ||
; $wl3/$wh3 are loaded twice per iteration, first via $p0 and later via $p1; | ||
; presumably they are the halves of $x3, so the $x0/$x6 shuffles see the first | ||
; value and the final $x3 shuffle sees the second (TODO confirm sub-register map). | ||
; $x2 is read and written by the same VSHUFFLE and is listed in liveins, so its | ||
; input comes from the previous iteration. | ||
bb.2.for.body (align 16): | ||
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 | ||
$wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$p0 = nuw PADD_mod_pseudo $p0, $m4 | ||
$p0 = PADD_imm9_pseudo $p0, 128 | ||
$wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x1 = VSHUFFLE $x8, $x10, $r4 | ||
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x11 = VSHUFFLE $x9, $x9, $r2 | ||
$x2 = VSHUFFLE $x2, $x10, $r16 | ||
$wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x0 = VSHUFFLE $x6, $x3, $r4 | ||
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x6 = VSHUFFLE $x6, $x3, $r16 | ||
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x5 = VSHUFFLE $x5, $x5, $r2 | ||
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x7 = VSHUFFLE $x7, $x7, $r2 | ||
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$x3 = VSHUFFLE $x3, $x3, $r2 | ||
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x2, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask | ||
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 | ||
; Return block; also the target of the guard branch in bb.0. | ||
bb.3.for.cond.cleanup (align 16): | ||
RET implicit $lr | ||
DelayedSchedBarrier | ||
... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
211 changes: 211 additions & 0 deletions
211
llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | ||
# This file is licensed under the Apache License v2.0 with LLVM Exceptions. | ||
# See https://llvm.org/LICENSE.txt for license information. | ||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
# | ||
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates | ||
|
||
# The --debug-only option used below is only registered by llc in builds with | ||
# assertions enabled; gate the test so it is skipped (instead of failing on an | ||
# unknown command-line option) in release builds. | ||
# REQUIRES: asserts | ||
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ | ||
# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s | ||
|
||
|
||
# derived from gemm_bf16_0 | ||
|
||
--- | | ||
; Placeholder IR for the machine function 'gemm' in the MIR document below. | ||
; Its scalar load/add/store loop is not the vector kernel being scheduled; it | ||
; provides the block names and the %ir.s.addr.06 / %ir.d.addr.07 values that the | ||
; MIR memory operands refer to, plus the loop metadata !6 on the back-edge branch. | ||
; The assertion lines inside the function are autogenerated (see the note at | ||
; the top of the file): regenerate them with utils/update_llc_test_checks.py | ||
; instead of editing them by hand. | ||
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { | ||
; CHECK-LABEL: gemm: | ||
; CHECK: .p2align 4 | ||
; CHECK-NEXT: // %bb.0: // %entry | ||
; CHECK-NEXT: mova r1, #0; nopb ; nopxm | ||
; CHECK-NEXT: ge r1, r1, r0 | ||
; CHECK-NEXT: jnz r1, #.LBB0_4 | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
; CHECK-NEXT: // %bb.1: // %for.body.preheader | ||
; CHECK-NEXT: add.nc lc, r0, #-1 | ||
; CHECK-NEXT: movxm ls, #.LBB0_2 | ||
; CHECK-NEXT: movxm le, #.L_LEnd0 | ||
; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh8, [p0], #32; vlda wh11, [p1], m6; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl1, [p0], #32; vlda wl5, [p1], m5; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh1, [p0], #32; vlda wh5, [p1], m6; nops ; nopxm ; nopv | ||
; CHECK-NEXT: paddb [p0], m4; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv | ||
; CHECK-NEXT: vldb wh0, [p0], #32; nopx | ||
; CHECK-NEXT: vldb wl3, [p0], #32 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0 | ||
; CHECK-NEXT: vldb wl0, [p1], m5 | ||
; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6 | ||
; CHECK-NEXT: vldb wl7, [p1], m5 | ||
; CHECK-NEXT: vldb.3d wh7, [p1], d1 | ||
; CHECK-NEXT: vshuffle x6, x8, x0, r3 | ||
; CHECK-NEXT: vshuffle x2, x8, x0, r16 | ||
; CHECK-NEXT: vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_2: // %for.body | ||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 | ||
; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32; nopx ; vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 | ||
; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 | ||
; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 | ||
; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r2 | ||
; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2 | ||
; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 | ||
; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r2 | ||
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r2 | ||
; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r2 | ||
; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6; vmac.f bml2, bml2, x9, x0, r2 | ||
; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml3, bml3, x6, x7, r2 | ||
; CHECK-NEXT: vldb.3d wh7, [p1], d1; vmac.f bml4, bml4, x2, x7, r2 | ||
; CHECK-NEXT: vshuffle x6, x8, x0, r3; vmac.f bml6, bml6, x10, x7, r2 | ||
; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml5, bml5, x9, x7, r2 | ||
; CHECK-NEXT: .L_LEnd0: | ||
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 | ||
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup | ||
; CHECK-NEXT: nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 | ||
; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 | ||
; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 | ||
; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 | ||
; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r2 | ||
; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r2 | ||
; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r2 | ||
; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 | ||
; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r2 | ||
; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r2 | ||
; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2 | ||
; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2 | ||
; CHECK-NEXT: vmac.f bml4, bml4, x2, x7, r2 | ||
; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r2 | ||
; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2 | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: .p2align 4 | ||
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup | ||
; CHECK-NEXT: nopa ; ret lr | ||
; CHECK-NEXT: nop // Delay Slot 5 | ||
; CHECK-NEXT: nop // Delay Slot 4 | ||
; CHECK-NEXT: nop // Delay Slot 3 | ||
; CHECK-NEXT: nop // Delay Slot 2 | ||
; CHECK-NEXT: nop // Delay Slot 1 | ||
entry: | ||
%cmp5 = icmp sgt i32 %n, 0 | ||
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup | ||
|
||
for.body.preheader: ; preds = %entry | ||
call void @llvm.set.loop.iterations.i32(i32 %n) | ||
br label %for.body | ||
|
||
for.cond.cleanup: ; preds = %for.body, %entry | ||
ret void | ||
|
||
for.body: ; preds = %for.body.preheader, %for.body | ||
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] | ||
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] | ||
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 | ||
%add = add nsw i32 %0, 1 | ||
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 | ||
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 | ||
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 | ||
%1 = call i1 @llvm.loop.decrement.i32(i32 1) | ||
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 | ||
} | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare void @llvm.set.loop.iterations.i32(i32) #1 | ||
|
||
; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn | ||
declare i1 @llvm.loop.decrement.i32(i32) #1 | ||
|
||
attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } | ||
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } | ||
|
||
!llvm.module.flags = !{!0} | ||
!llvm.ident = !{!1} | ||
|
||
!0 = !{i32 1, !"wchar_size", i32 4} | ||
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} | ||
!2 = !{!3, !3, i64 0} | ||
!3 = !{!"int", !4, i64 0} | ||
!4 = !{!"omnipotent char", !5, i64 0} | ||
!5 = !{!"Simple C/C++ TBAA"} | ||
!6 = distinct !{!6, !7, !8} | ||
!7 = !{!"llvm.loop.mustprogress"} | ||
!8 = !{!"llvm.loop.itercount.range", i64 10} | ||
|
||
... | ||
--- | ||
# Machine function under test. llc enters the pipeline at 'postmisched' (see the | ||
# run line at the top of the file); all register operands below are physical. | ||
name: gemm | ||
alignment: 16 | ||
tracksRegLiveness: true | ||
body: | | ||
; Loop guard: $r1 = GE(0, $r0), then JNZ to the return block bb.3. This mirrors | ||
; the `icmp sgt %n, 0` test in the placeholder IR (loop skipped otherwise). | ||
bb.0.entry (align 16): | ||
successors: %bb.1(0x50000000), %bb.3(0x30000000) | ||
liveins: $p0, $p1, $r0 | ||
$r1 = MOV_RLC_imm10_pseudo 0 | ||
$r1 = GE $r1, $r0 | ||
JNZ $r1, %bb.3 | ||
DelayedSchedBarrier | ||
; Loop register setup -- presumably $lc/$ls/$le are loop count/start/end | ||
; (TODO confirm against the AIE2 ISA). The input count is $r0 + 0, while the | ||
; expected assembly shows `add.nc lc, r0, #-1`, with part of the loop body also | ||
; emitted before .LBB0_2 and after .L_LEnd0. | ||
bb.1.for.body.preheader: | ||
successors: %bb.2(0x80000000) | ||
liveins: $p0, $p1, $r0 | ||
$lc = ADD_NC $r0, 0 | ||
$ls = MOVXM_lng_cg %bb.2 | ||
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0> | ||
; Loop body handed to the scheduler, grouped by kind rather than interleaved: | ||
; 8 loads via $p0 (with one pointer add), 4 VSHUFFLEs, 8 loads via $p1, | ||
; 4 VSHUFFLEs, then 16 VMAC_F updates, one per accumulator $bmh0-$bmh8 and | ||
; $bml0-$bml6. Presumably this unscheduled order is what "nopresched" in the | ||
; file name refers to (TODO confirm). | ||
; The $p0 loads carry memory operands on %ir.s.addr.06 (addrspace 6) and the | ||
; $p1 loads on %ir.d.addr.07 (addrspace 5); in the expected assembly the first | ||
; four $p1 loads appear as `vlda` in the same bundles as `vldb` loads via $p0. | ||
; $wl0/$wh0 are loaded twice per iteration, first via $p0 and later via $p1; | ||
; presumably they are the halves of $x0 (TODO confirm sub-register map). | ||
bb.2.for.body (align 16): | ||
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 | ||
$wl8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$p0 = nuw PADD_mod_pseudo $p0, $m4 | ||
$wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) | ||
$x6 = VSHUFFLE $x8, $x0, $r3 | ||
$x2 = VSHUFFLE $x8, $x0, $r16 | ||
$x10 = VSHUFFLE $x1, $x3, $r3 | ||
$x9 = VSHUFFLE $x1, $x3, $r16 | ||
$wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wh11, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) | ||
$x3 = VSHUFFLE $x11, $x11, $r6 | ||
$x5 = VSHUFFLE $x5, $x5, $r6 | ||
$x0 = VSHUFFLE $x0, $x0, $r6 | ||
$x7 = VSHUFFLE $x7, $x7, $r6 | ||
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x2, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask | ||
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2 | ||
; Return block; also the target of the guard branch in bb.0. | ||
bb.3.for.cond.cleanup (align 16): | ||
RET implicit $lr | ||
DelayedSchedBarrier | ||
... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters