Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AIE] Mostly refactoring of postpipeliner
Browse files Browse the repository at this point in the history
SlotCounts class for use in resource computations
PostPipelinerStrategy interface class
Martien de Jong committed Nov 18, 2024
1 parent b4ddb6e commit 8f03d3e
Showing 14 changed files with 1,460 additions and 189 deletions.
425 changes: 345 additions & 80 deletions llvm/lib/Target/AIE/AIEPostPipeliner.cpp

Large diffs are not rendered by default.

90 changes: 86 additions & 4 deletions llvm/lib/Target/AIE/AIEPostPipeliner.h
Original file line number Diff line number Diff line change
@@ -17,6 +17,8 @@
#include "AIEHazardRecognizer.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ResourceScoreboard.h"
#include "llvm/Support/raw_ostream.h"
#include <unordered_set>
#include <vector>

namespace llvm {
@@ -31,6 +33,37 @@ namespace llvm::AIE {
/// These copies are scheduled 'synchronously', i.e. the copies are checked
/// to fit into the same cycle modulo II.

/// Efficient representation of slot requirements.
/// Holds one counter per slot in a fixed-capacity array; Size records how
/// many leading counters are valid.
class SlotCounts {
  static const int MaxSlots = 16;
  // Zero-initialized so that entries at or beyond Size really read as zero,
  // also for a default-constructed object. operator[] does not consult Size.
  short Counts[MaxSlots] = {};
  // The number of valid Counts. Further counts are assumed to be zero.
  int Size = 0;

public:
  // Useful constructors
  SlotCounts() = default;
  SlotCounts(SlotBits Bits);
  SlotCounts(const SlotCounts &Org);
  SlotCounts &operator=(const SlotCounts &Rhs) = default;

  // Compute the number of required cycles
  int max();

  // Add slot counts of Other to this
  SlotCounts &operator+=(const SlotCounts &Other);

  // By-value addition.
  SlotCounts operator+(const SlotCounts &Other) const;

  // Indexing. No bounds check is performed; I must be in [0, MaxSlots).
  const short &operator[](int I) const { return Counts[I]; }

  // The number of valid counts.
  int size() const { return Size; }
};

llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SlotCounts &Val);

class NodeInfo {
public:
// Keep track of being scheduled. Only maintained for the
@@ -43,14 +76,50 @@ class NodeInfo {
int ModuloCycle = 0;
// Cycle / II
int Stage = 0;

int StaticEarliest;
int StaticLatest;

// The earliest cycle at which this can be scheduled to meet latencies
// This includes the lowerbound of the modulo condition, i.e.
// Earliest(N) >= Cycle(N - NInstr) + II
int Earliest = 0;
// For an LCD K1 -> K2, this holds II + Earliest(K2 - NInstr) - Latency(LCD)
// Instructions with lower Latest have higher priority in the
// top down scheduling
int Latest = 0;
// Latest is relative to the size of the linear schedule. Since this equals
// StageCount * II, and latest should not be less than earliest, this
// implies a minimum stagecount.
int Latest = -1;

// Latest corrected taking Earliest of an LCD successor into account
int LCDLatest = -1;

// The transitive closure of my predecessors
std::unordered_set<int> Ancestors;

// The transitive closure of my successors
std::unordered_set<int> Offspring;
};

/// Base class for postpipeliner strategies. Every hook has a default
/// implementation, so a derived strategy only overrides what it needs.
class PostPipelinerStrategy {
protected:
  // Offset that the default latest() adds to a node's Latest.
  int LatestBias = 0;

public:
  PostPipelinerStrategy(int LatestBias) : LatestBias(LatestBias) {}
  virtual ~PostPipelinerStrategy();

  // Name of the strategy.
  virtual std::string name() { return "PostPipelinerStrategy"; }

  // Choose among available alternatives.
  // The default expresses no preference.
  virtual bool better(const NodeInfo &A, const NodeInfo &B) { return false; }

  // Tweak the effective earliest. The default passes N.Earliest through.
  virtual int earliest(const NodeInfo &N) { return N.Earliest; }

  // Tweak the effective latest. The default is N.Latest shifted by
  // LatestBias.
  virtual int latest(const NodeInfo &N) { return N.Latest + LatestBias; }

  // Report a final selection. This marks the start of selecting a new node.
  // fromTop() should be invariant between calls to selected().
  // The default does nothing.
  virtual void selected(NodeInfo &N) {}

  // Select from top or from bottom. The default selects from the top.
  virtual bool fromTop() { return true; }
};

class PipelineScheduleVisitor {
@@ -72,6 +141,7 @@ class PostPipeliner {

int NTotalInstrs = 0;
int FirstUnscheduled = 0;
int LastUnscheduled = -1;

/// Holds the cycle of each SUnit. The following should hold:
/// Cycle(N) mod II == Cycle(N % NInstr) mod II
@@ -96,7 +166,8 @@ class PostPipeliner {
int II = 1;
int NStages = 0;

/// Place SU in cycle Cycle; update Earliest of dependent instructions
/// Place SU in cycle Cycle; update Earliest of successors and Latest
/// of predecessors
void scheduleNode(SUnit &SU, int Cycle);

/// Compute the stage in which each instruction runs
@@ -111,13 +182,24 @@ class PostPipeliner {
/// on the second iteration.
void computeLoopCarriedParameters();

/// Helpers of computeLoopCarriedParameters()
void computeForward();
bool computeBackward();

/// Forget the previous round of scheduling
void resetSchedule();

/// Try all heuristics, stop at the first that fits the II
/// If it returns true, a valid schedule is laid down in Info.
bool tryHeuristics();

/// Find the first available unscheduled instruction with the highest
/// priority
int mostUrgent();
int mostUrgent(PostPipelinerStrategy &Strategy);

/// Schedule the original instructions, taking the modulo scoreboard
/// into account
bool scheduleFirstIteration();
bool scheduleFirstIteration(PostPipelinerStrategy &Strategy);

/// Check that all copied instructions can run in the same modulo cycle
bool scheduleOtherIterations();
Original file line number Diff line number Diff line change
@@ -5,7 +5,8 @@
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \
# RUN: --debug-only=postpipeliner-summary -o - 2>&1 | FileCheck %s

# add-store can run in a two-stage II=1 pipeline

3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir
Original file line number Diff line number Diff line change
@@ -7,7 +7,8 @@
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates


# RUN: llc -O2 --mtriple=aie2 --start-before=postmisched %s -o - | FileCheck %s
# RUN: llc -O2 --mtriple=aie2 --start-before=postmisched %s \
# RUN: --debug-only=postpipeliner-summary -o - 2>&1 | FileCheck %s

--- |
define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %cond, ptr %cond.i50, <16 x i32> %0, i32 %cond67.i79, i20 %idx.ext.i.i81, i20 %idx.ext.i404.i, i20 %idx.ext.i410.i, i20 %idx.ext.i434.i85, i32 %1, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i32 %7, i32 %8, i32 %or9.i.i.i.i.i96, i32 %9, i20 %idx.ext.i422.i82, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i20 %20, i20 %21, i20 %22, i20 %23, i32 %conv192.i107, i20 %24, i20 %idx.ext.i428.i, i20 %25, i20 %26, i20 %27, i32 %28) #0 {
Original file line number Diff line number Diff line change
@@ -44,9 +44,9 @@
; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x8, x0, x2, r9
; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x6, x6, s0, x3, r3
; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x3, x4, x6, r9
; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29
; CHECK-NEXT: vshuffle x1, x3, x5, r13
; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29
; CHECK-NEXT: vshuffle x1, x3, x5, r13; vmac.f bml4, bml4, x8, x7, r29
; CHECK-NEXT: vshuffle x3, x3, x5, r24
; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bmh1, bmh1, x8, x9, r29
; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29
; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh3, bmh3, x3, x9, r29
; CHECK-NEXT: add r3, r3, #34; vmac.f bmh2, bmh2, x10, x9, r29
@@ -71,9 +71,9 @@
; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x8, x0, x2, r9
; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x6, x6, s0, x3, r3
; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x3, x4, x6, r9
; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29
; CHECK-NEXT: vshuffle x1, x3, x5, r13
; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29
; CHECK-NEXT: vshuffle x1, x3, x5, r13; vmac.f bml4, bml4, x8, x7, r29
; CHECK-NEXT: vshuffle x3, x3, x5, r24
; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bmh1, bmh1, x8, x9, r29
; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29
; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh3, bmh3, x3, x9, r29
; CHECK-NEXT: add r3, r3, #34; vmac.f bmh2, bmh2, x10, x9, r29
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir
Original file line number Diff line number Diff line change
@@ -29,8 +29,8 @@
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv
; CHECK-NEXT: nopa ; nopx
; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopxm ; nopv
; CHECK-NEXT: nopa ; mov p2, p1
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
234 changes: 234 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s


# derived from gemm_bf16_0

--- |
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
; CHECK-LABEL: gemm:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mova r1, #0; nopb ; nopxm
; CHECK-NEXT: ge r1, r1, r0
; CHECK-NEXT: jnz r1, #.LBB0_4
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: add.nc lc, r0, #-1
; CHECK-NEXT: movxm ls, #.LBB0_2
; CHECK-NEXT: movxm le, #.L_LEnd0
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl10, [p0], #32; nopx
; CHECK-NEXT: vldb wh10, [p0], #32
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3
; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3
; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3
; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%cmp5 = icmp sgt i32 %n, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %n)
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2
%add = add nsw i32 %0, 1
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1
%1 = call i1 @llvm.loop.decrement.i32(i32 1)
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
}

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare void @llvm.set.loop.iterations.i32(i32) #1

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare i1 @llvm.loop.decrement.i32(i32) #1

attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!"llvm.loop.itercount.range", i64 10}

...
---
name: gemm
alignment: 16
tracksRegLiveness: true
body: |
bb.0.entry (align 16):
successors: %bb.1(0x50000000), %bb.3(0x30000000)
liveins: $p0, $p1, $r0
renamable $r1 = MOV_RLC_imm10_pseudo 0
renamable $r1 = GE killed renamable $r1, renamable $r0
JNZ killed renamable $r1, %bb.3
DelayedSchedBarrier
bb.1.for.body.preheader:
successors: %bb.2(0x80000000)
liveins: $p0, $p1, $r0
$lc = ADD_NC $r0, 0
$ls = MOVXM_lng_cg %bb.2
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0>
bb.2.for.body (align 16):
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13
$wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$p0 = nuw PADD_mod_pseudo $p0, $m4
$p0 = PADD_imm9_pseudo $p0, 128
$wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x1 = VSHUFFLE $x8, $x10, $r4
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x11 = VSHUFFLE $x9, $x9, $r2
$x8 = VSHUFFLE $x8, $x10, $r16
$wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x0 = VSHUFFLE $x6, $x3, $r4
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$x6 = VSHUFFLE $x6, $x3, $r16
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x5 = VSHUFFLE $x5, $x5, $r2
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$x7 = VSHUFFLE $x7, $x7, $r2
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$x3 = VSHUFFLE $x3, $x3, $r2
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2
bb.3.for.cond.cleanup (align 16):
RET implicit $lr
DelayedSchedBarrier
...
234 changes: 234 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s


# derived from gemm_bf16_0

--- |
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
; CHECK-LABEL: gemm:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mova r1, #0; nopb ; nopxm
; CHECK-NEXT: ge r1, r1, r0
; CHECK-NEXT: jnz r1, #.LBB0_4
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: add.nc lc, r0, #-1
; CHECK-NEXT: movxm ls, #.LBB0_2
; CHECK-NEXT: movxm le, #.L_LEnd0
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl10, [p0], #32; nopx
; CHECK-NEXT: vldb wh10, [p0], #32
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x0, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x0, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x0, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x0, x7, r3
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x0, x3, r3
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x10, x3, r3
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x0, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x0, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x0, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3
; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3
; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vmac.f bml5, bml5, x0, x3, r3
; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%cmp5 = icmp sgt i32 %n, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %n)
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2
%add = add nsw i32 %0, 1
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1
%1 = call i1 @llvm.loop.decrement.i32(i32 1)
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
}

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare void @llvm.set.loop.iterations.i32(i32) #1

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare i1 @llvm.loop.decrement.i32(i32) #1

attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!"llvm.loop.itercount.range", i64 10}

...
---
name: gemm
alignment: 16
tracksRegLiveness: true
body: |
bb.0.entry (align 16):
successors: %bb.1(0x50000000), %bb.3(0x30000000)
liveins: $p0, $p1, $r0
renamable $r1 = MOV_RLC_imm10_pseudo 0
renamable $r1 = GE killed renamable $r1, renamable $r0
JNZ killed renamable $r1, %bb.3
DelayedSchedBarrier
bb.1.for.body.preheader:
successors: %bb.2(0x80000000)
liveins: $p0, $p1, $r0
$lc = ADD_NC $r0, 0
$ls = MOVXM_lng_cg %bb.2
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0>
bb.2.for.body (align 16):
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13
$wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$p0 = nuw PADD_mod_pseudo $p0, $m4
$p0 = PADD_imm9_pseudo $p0, 128
$wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x1 = VSHUFFLE $x8, $x10, $r4
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x11 = VSHUFFLE $x9, $x9, $r2
$x0 = VSHUFFLE $x8, $x10, $r16
$wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x10 = VSHUFFLE $x6, $x3, $r4
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$x6 = VSHUFFLE $x6, $x3, $r16
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x5 = VSHUFFLE $x5, $x5, $r2
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$x7 = VSHUFFLE $x7, $x7, $r2
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$x3 = VSHUFFLE $x3, $x3, $r2
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2
bb.3.for.cond.cleanup (align 16):
RET implicit $lr
DelayedSchedBarrier
...
234 changes: 234 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s


# derived from gemm_bf16_0

--- |
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
; CHECK-LABEL: gemm:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mova r1, #0; nopb ; nopxm
; CHECK-NEXT: ge r1, r1, r0
; CHECK-NEXT: jnz r1, #.LBB0_4
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: add.nc lc, r0, #-1
; CHECK-NEXT: movxm ls, #.LBB0_2
; CHECK-NEXT: movxm le, #.L_LEnd0
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl10, [p0], #32; nopx
; CHECK-NEXT: vldb wh10, [p0], #32
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x2, x2, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x2, x7, r3
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x2, x3, r3
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x2, x2, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3
; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3
; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vmac.f bml5, bml5, x2, x3, r3
; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%cmp5 = icmp sgt i32 %n, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %n)
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2
%add = add nsw i32 %0, 1
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1
%1 = call i1 @llvm.loop.decrement.i32(i32 1)
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
}

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare void @llvm.set.loop.iterations.i32(i32) #1

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare i1 @llvm.loop.decrement.i32(i32) #1

attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!"llvm.loop.itercount.range", i64 10}

...
---
# Machine function that llc schedules in this test. Its memory operands refer
# to %ir.s.addr.06 from the IR module earlier in this file.
#   bb.0: materializes 0 in $r1, compares it with $r0 (GE) and jumps to bb.3
#         when the result is non-zero, i.e. the loop is skipped entirely.
#   bb.1: preheader; writes $lc from $r0 (ADD_NC with 0) and points $ls at
#         bb.2 and $le at the .L_LEnd0 symbol (presumably the hardware-loop
#         count/start/end registers -- confirm against the AIE2 ISA).
#   bb.2: single-block loop body closed by PseudoLoopEnd. It holds 16 vector
#         loads through $p0/$p1, 8 VSHUFFLEs and 16 VMAC_F accumulations into
#         $bmh0-$bmh8 and $bml0-$bml6, interleaved with one another in the
#         input order.
#   bb.3: returns through $lr.
name: gemm
alignment: 16
tracksRegLiveness: true
body: |
bb.0.entry (align 16):
successors: %bb.1(0x50000000), %bb.3(0x30000000)
liveins: $p0, $p1, $r0
renamable $r1 = MOV_RLC_imm10_pseudo 0
renamable $r1 = GE killed renamable $r1, renamable $r0
JNZ killed renamable $r1, %bb.3
DelayedSchedBarrier
bb.1.for.body.preheader:
successors: %bb.2(0x80000000)
liveins: $p0, $p1, $r0
$lc = ADD_NC $r0, 0
$ls = MOVXM_lng_cg %bb.2
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0>
bb.2.for.body (align 16):
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13
$wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$p0 = nuw PADD_mod_pseudo $p0, $m4
$p0 = PADD_imm9_pseudo $p0, 128
$wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x1 = VSHUFFLE $x8, $x10, $r4
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x11 = VSHUFFLE $x9, $x9, $r2
$x2 = VSHUFFLE $x2, $x10, $r16
$wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x0 = VSHUFFLE $x6, $x3, $r4
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$x6 = VSHUFFLE $x6, $x3, $r16
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x5 = VSHUFFLE $x5, $x5, $r2
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask
$wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$x7 = VSHUFFLE $x7, $x7, $r2
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$x3 = VSHUFFLE $x3, $x3, $r2
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x2, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2
bb.3.for.cond.cleanup (align 16):
RET implicit $lr
DelayedSchedBarrier
...
Original file line number Diff line number Diff line change
@@ -31,59 +31,59 @@
; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl1, [p0, #64]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh1, [p0, #96]; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh2, [p0], #32; nopx
; CHECK-NEXT: vldb wh8, [p1], m6
; CHECK-NEXT: vldb wh1, [p0, #96]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wh8, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl2, [p0], #32; nopx
; CHECK-NEXT: vldb wh2, [p0], #32
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl9, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0
; CHECK-NEXT: vldb wh9, [p1], m6
; CHECK-NEXT: vldb wl10, [p1], m5
; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6
; CHECK-NEXT: vldb wl0, [p0, #0]
; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: padds [p0], #128; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2
; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2
; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vshuffle x8, x8, x8, r6
; CHECK-NEXT: vldb wh10, [p1], m6
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x4, x0, x2, r3
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16
; CHECK-NEXT: vldb wl0, [p0, #0]; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x7, x1, x3, r16
; CHECK-NEXT: vldb wh1, [p0, #96]; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: padds [p0], #128; vldb wh8, [p1], m6; vmac.f bmh4, bmh0, x4, x9, r2
; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2
; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh5, bmh1, x6, x9, r2
; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2
; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bmh8, bmh0, x4, x10, r2
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2
; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2
; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml2, bmh3, x7, x10, r2
; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x4, x0, x2, r3; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; vldb wl0, [p0, #0]; nopxm ; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: padds [p0], #128; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2
; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2
; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2
; CHECK-NEXT: nopa ; vldb wl0, [p0, #0]; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vldb wh1, [p0, #96]; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: padds [p0], #128; vldb wh8, [p1], m6; vmac.f bmh4, bmh0, x4, x9, r2
; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2
; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh5, bmh1, x6, x9, r2
; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2
; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bmh8, bmh0, x4, x10, r2
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2
; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2
; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml2, bmh3, x7, x10, r2
; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x4, x0, x2, r3; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2
@@ -97,9 +97,9 @@
; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Original file line number Diff line number Diff line change
@@ -42,11 +42,11 @@
; CHECK-NEXT: vldb wh9, [p1], m6
; CHECK-NEXT: vldb wl10, [p1], m5
; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6
; CHECK-NEXT: vldb wl0, [p0], #32
; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16
; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16
; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2
@@ -58,14 +58,14 @@
; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2
; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2
@@ -77,13 +77,13 @@
; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2
; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2
; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2
; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2
; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2
; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2
; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2
@@ -97,9 +97,9 @@
; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2
; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2
; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2
; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
211 changes: 211 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

# The debug-only option below is only registered in llc builds that have
# assertions enabled; without this guard the test fails with an unknown
# command-line argument on release builds instead of being skipped.
# REQUIRES: asserts
# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \
# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s


# derived from gemm_bf16_0

--- |
define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 {
; CHECK-LABEL: gemm:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mova r1, #0; nopb ; nopxm
; CHECK-NEXT: ge r1, r1, r0
; CHECK-NEXT: jnz r1, #.LBB0_4
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: add.nc lc, r0, #-1
; CHECK-NEXT: movxm ls, #.LBB0_2
; CHECK-NEXT: movxm le, #.L_LEnd0
; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh8, [p0], #32; vlda wh11, [p1], m6; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl1, [p0], #32; vlda wl5, [p1], m5; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh1, [p0], #32; vlda wh5, [p1], m6; nops ; nopxm ; nopv
; CHECK-NEXT: paddb [p0], m4; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh0, [p0], #32; nopx
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb.3d wh3, [p0], d0
; CHECK-NEXT: vldb wl0, [p1], m5
; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh7, [p1], d1
; CHECK-NEXT: vshuffle x6, x8, x0, r3
; CHECK-NEXT: vshuffle x2, x8, x0, r16
; CHECK-NEXT: vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2
; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32; nopx ; vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2
; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2
; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2
; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r2
; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2
; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2
; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r2
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r2
; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r2
; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6; vmac.f bml2, bml2, x9, x0, r2
; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml3, bml3, x6, x7, r2
; CHECK-NEXT: vldb.3d wh7, [p1], d1; vmac.f bml4, bml4, x2, x7, r2
; CHECK-NEXT: vshuffle x6, x8, x0, r3; vmac.f bml6, bml6, x10, x7, r2
; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml5, bml5, x9, x7, r2
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2
; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2
; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2
; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2
; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r2
; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r2
; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r2
; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2
; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r2
; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r2
; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2
; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2
; CHECK-NEXT: vmac.f bml4, bml4, x2, x7, r2
; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r2
; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
entry:
%cmp5 = icmp sgt i32 %n, 0
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %n)
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body.preheader, %for.body
%d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ]
%s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ]
%0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2
%add = add nsw i32 %0, 1
store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1
%incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1
%1 = call i1 @llvm.loop.decrement.i32(i32 1)
br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6
}

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare void @llvm.set.loop.iterations.i32(i32) #1

; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
declare i1 @llvm.loop.decrement.i32(i32) #1

attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!"llvm.loop.itercount.range", i64 10}

...
---
# Machine function that llc schedules in this test. Its memory operands refer
# to %ir.s.addr.06 (loads through $p0, addrspace 6) and %ir.d.addr.07 (loads
# through $p1, addrspace 5) from the IR module earlier in this file.
#   bb.0: materializes 0 in $r1, compares it with $r0 (GE) and jumps to bb.3
#         when the result is non-zero, i.e. the loop is skipped entirely.
#   bb.1: preheader; writes $lc from $r0 (ADD_NC with 0) and points $ls at
#         bb.2 and $le at the .L_LEnd0 symbol (presumably the hardware-loop
#         count/start/end registers -- confirm against the AIE2 ISA).
#   bb.2: single-block loop body closed by PseudoLoopEnd. The instructions are
#         listed group by group rather than interleaved: the $p0 loads (with
#         one PADD), four VSHUFFLEs, the $p1 loads, four more VSHUFFLEs, and
#         finally 16 VMAC_F accumulations into $bmh0-$bmh8 and $bml0-$bml6.
#   bb.3: returns through $lr.
name: gemm
alignment: 16
tracksRegLiveness: true
body: |
bb.0.entry (align 16):
successors: %bb.1(0x50000000), %bb.3(0x30000000)
liveins: $p0, $p1, $r0
$r1 = MOV_RLC_imm10_pseudo 0
$r1 = GE $r1, $r0
JNZ $r1, %bb.3
DelayedSchedBarrier
bb.1.for.body.preheader:
successors: %bb.2(0x80000000)
liveins: $p0, $p1, $r0
$lc = ADD_NC $r0, 0
$ls = MOVXM_lng_cg %bb.2
$le = MOVXM_lng_cg <mcsymbol .L_LEnd0>
bb.2.for.body (align 16):
liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13
$wl8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$p0 = nuw PADD_mod_pseudo $p0, $m4
$wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6)
$x6 = VSHUFFLE $x8, $x0, $r3
$x2 = VSHUFFLE $x8, $x0, $r16
$x10 = VSHUFFLE $x1, $x3, $r3
$x9 = VSHUFFLE $x1, $x3, $r16
$wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wh11, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5)
$x3 = VSHUFFLE $x11, $x11, $r6
$x5 = VSHUFFLE $x5, $x5, $r6
$x0 = VSHUFFLE $x0, $x0, $r6
$x7 = VSHUFFLE $x7, $x7, $r6
$bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask
$bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x2, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask
$bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask
PseudoLoopEnd <mcsymbol .L_LEnd0>, %bb.2
bb.3.for.cond.cleanup (align 16):
RET implicit $lr
DelayedSchedBarrier
...
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir
Original file line number Diff line number Diff line change
@@ -30,51 +30,51 @@
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl10, [p0], #32
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl10, [p0], #32; nopx
; CHECK-NEXT: vldb wh10, [p0], #32
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vshuffle x8, x8, x10, r16
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopx ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3
; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3
; CHECK-NEXT: padds [p0], #128; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml6, bml6, x10, x3, r3
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x10, x3, r3
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vshuffle x8, x8, x10, r16
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: .L_LEnd0:
51 changes: 30 additions & 21 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir
Original file line number Diff line number Diff line change
@@ -34,49 +34,58 @@
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: add.nc lc, r0, #-3
; CHECK-NEXT: add.nc lc, r0, #-5
; CHECK-NEXT: movxm ls, #.LBB0_2
; CHECK-NEXT: movxm le, #.L_LEnd0
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32
; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32
; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1
; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1
; CHECK-NEXT: nop
; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32
; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1
; CHECK-NEXT: nop
; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv
; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; nopb ; vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32
; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv
; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1
; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv
; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopa ; vsrs.s8.s32 wh2, cm1, s1; nopx
; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1
; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32
; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1
; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1
; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32
; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1
; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: nop
; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1
; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1
; CHECK-NEXT: nop
; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32
; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
; CHECK-NEXT: nopa ; ret lr

0 comments on commit 8f03d3e

Please sign in to comment.