diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index a8bcbe216736..c355c9e7be19 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -704,6 +704,8 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT) { std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations; Mutations.emplace_back(std::make_unique()); if (!TT.isAIE1()) { + if (EnableWAWStickyRegisters) + Mutations.emplace_back(std::make_unique<WAWStickyRegistersEdges>()); Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index d6896bd624da..57fa9ed2ab52 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -1320,8 +1320,9 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA, assert(BS.getRegions().size() == 1); // Try to wrap the linear schedule within II. // We virtually unroll the body by the stagecount, computed from rounding - // up the length divided by II. - NCopies = (BS.getScheduleLength() + II - 1) / II; + // up the length divided by II, adding one more stage to account for + // the added resource contention. + NCopies = (BS.getScheduleLength() + II - 1) / II + 1; } DEBUG_BLOCKS(dbgs() << " buildGraph, NCopies=" << NCopies << "\n"); for (int S = 0; S < NCopies; S++) { @@ -1386,6 +1387,8 @@ void AIEScheduleDAGMI::schedule() { // If it succeeds, we need to implement it, if we fail we fall back on the // normal loop schedule SchedImpl->buildGraph(*this, AA); + postProcessDAG(); + auto &PostSWP = BS.getPostSWP(); if (PostSWP.schedule(*this, BS.FixPoint.II)) { BS.setPipelined(); diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 961aacb6acca..48f194d0878b 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -12,14 +12,23 @@ //===----------------------------------------------------------------------===// #include "AIEPostPipeliner.h" +#include "AIESlotCounts.h" #include "Utils/AIELoopUtils.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "postpipeliner" +#define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X) +#define DEBUG_FULL(X) DEBUG_WITH_TYPE("postpipeliner-full", X) namespace llvm::AIE { +static cl::opt<int> + Heuristic("aie-postpipeliner-heuristic", + cl::desc("Select one specific post-pipeliner heuristic"), + cl::init(-1), cl::Hidden); + PipelineScheduleVisitor::~PipelineScheduleVisitor() {} class PostPipelineDumper : public PipelineScheduleVisitor { @@ -106,32 +115,24 @@ bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) { return true; } +static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) { + auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode())); + return SlotInfo ? SlotInfo->getSlotSet() : 0; +} + int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) { - // For each instruction, find the first cycle in which it fits and collect the - // maximum - std::vector<SlotBits> Scoreboard(NInstr, 0); - int MII = 1; + // Add up all slot requirements and return the maximum slot count + SlotCounts Counts; for (auto &MI : LoopBlock) { - auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode())); - SlotBits Slots = SlotInfo ? 
SlotInfo->getSlotSet() : 0; - - int C = 0; - while (C < NInstr && (Scoreboard[C] & Slots)) { - C++; - } - if (C >= NInstr) { - MII = NInstr; - break; - } - Scoreboard[C] |= Slots; - MII = std::max(MII, C + 1); + Counts += getSlotCounts(MI, TII); } + int MII = Counts.max(); LLVM_DEBUG(dbgs() << "PostPipeliner: ResMII=" << MII << "\n"); return MII; } -// This assigns Cycle of SU, Earliest of its predecessors and Earliest of -// the next instance of SU. +// This assigns Cycle of SU, Earliest of its successors and Latest of its +// predecessors void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { LLVM_DEBUG(dbgs() << "PostPipeline " << SU.NodeNum << " in cycle " << Cycle << ". "); @@ -145,8 +146,25 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { const int SNum = Succ->NodeNum; const int NewEarliest = Cycle + Latency; if (NewEarliest > Info[SNum].Earliest) { + Info[SNum].LastEarliestPusher = SU.NodeNum; Info[SNum].Earliest = NewEarliest; - LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << "; "); + Info[SU.NodeNum].NumPushedEarliest++; + LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << " -; "); + } + } + for (auto &Dep : SU.Preds) { + int Latency = Dep.getSignedLatency(); + auto *Pred = Dep.getSUnit(); + if (Pred->isBoundaryNode()) { + continue; + } + const int PNum = Pred->NodeNum; + const int NewLatest = Cycle - Latency; + if (NewLatest < Info[PNum].Latest) { + Info[PNum].LastLatestPusher = SU.NodeNum; + Info[PNum].Latest = NewLatest; + Info[SU.NodeNum].NumPushedLatest++; + LLVM_DEBUG(dbgs() << PNum << " to - " << Info[PNum].Latest << "; "); } } LLVM_DEBUG(dbgs() << "\n"); @@ -160,8 +178,10 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { // Check resources. We only insert at the position modulo II. Since we insert // all iterations separately, the resources that wrap around accumulate in the // overflow area, causing conflicts when inserting future iterations -int PostPipeliner::fit(MachineInstr *MI, int Earliest, int NTries, int II) { - for (int C = Earliest; C < Earliest + NTries; C++) { +int PostPipeliner::fit(MachineInstr *MI, int First, int Last, int II) { + const int Step = First > Last ? -1 : 1; + LLVM_DEBUG(dbgs() << " " << First << ", " << Last << ", " << Step << "\n"); + for (int C = First; C != Last; C += Step) { int Mod = C % II; LLVM_DEBUG(dbgs() << " at " << C << " (" << Mod << ")\n"); if (!HR.checkConflict(Scoreboard, *MI, -Depth + Mod)) { @@ -174,49 +194,153 @@ int PostPipeliner::fit(MachineInstr *MI, int Earliest, int NTries, int II) { return -1; } -void PostPipeliner::computeLoopCarriedParameters() { - // We schedule the first iteration, only using earliest. This updates - // earliest of the successors. Any successor in the second iteration - // represents a loop carried dependence, and we account for that by - // propagating its Earliest back to the first iteration - // Note that we don't have to clean the effects of this exploration, - // since the real scheduling will overwrite Cycle, and the ultimate Earliest - // will never be less than we compute here. 
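// For intuition, a minimal standalone sketch (not part of the patch) of the
// forward sweep that computeForward() below performs: node numbers follow the
// original instruction order, which is a topological order, so a single pass
// suffices to relax Earliest across all edges. ToyNode and toyComputeForward
// are illustrative names only, not identifiers from the patch.
#include <algorithm>
#include <utility>
#include <vector>

struct ToyNode {
  int Earliest = 0;
  std::vector<std::pair<int, int>> Succs; // (successor index, latency)
};

void toyComputeForward(std::vector<ToyNode> &Nodes) {
  for (int K = 0; K < (int)Nodes.size(); K++)
    for (auto [S, Latency] : Nodes[K].Succs)
      // A successor can start no earlier than our Earliest plus the latency.
      Nodes[S].Earliest = std::max(Nodes[S].Earliest, Nodes[K].Earliest + Latency);
}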
- +void PostPipeliner::computeForward() { + // The forward order defines a topological sort, so we can compute + // Earliest and Ancestors in a single forward sweep for (int K = 0; K < NInstr; K++) { + auto &Me = Info[K]; + SUnit &SU = DAG->SUnits[K]; + for (auto &Dep : SU.Preds) { + if (Dep.getKind() != SDep::Data) { + continue; + } + int P = Dep.getSUnit()->NodeNum; + assert(P < K); + Me.Ancestors.insert(P); + auto &Pred = Info[P]; + for (int Anc : Pred.Ancestors) { + Me.Ancestors.insert(Anc); + } + } + for (auto &Dep : SU.Succs) { + auto *Succ = Dep.getSUnit(); + if (Succ->isBoundaryNode()) { + continue; + } + auto &SInfo = Info[Succ->NodeNum]; + const int NewEarliest = Me.Earliest + Dep.getSignedLatency(); + SInfo.Earliest = std::max(SInfo.Earliest, NewEarliest); + } + Me.Slots = getSlotCounts(*SU.getInstr(), TII); + } +} + +bool PostPipeliner::computeBackward() { + bool Changed = false; + + auto AddOffspring = [&Changed](NodeInfo &Info, int E) { + if (Info.Offspring.insert(E).second) { + Changed = true; + } + }; + + // Traversing backwards will speed convergence a bit + for (int K = NInstr - 1; K >= 0; K--) { SUnit &SU = DAG->SUnits[K]; - const int Earliest = Info[K].Earliest; - scheduleNode(SU, Earliest); + auto &Me = Info[K]; + const int Latest = Info[K].Latest; + for (auto &Dep : SU.Preds) { + if (Dep.getKind() != SDep::Data) { + continue; + } + int P = Dep.getSUnit()->NodeNum; + auto &Pred = Info[P]; + AddOffspring(Pred, K); + for (auto Offs : Me.Offspring) { + AddOffspring(Pred, Offs); + } + int NewLatest = Latest - Dep.getSignedLatency(); + if (NewLatest < Pred.Latest) { + Pred.Latest = NewLatest; + Changed = true; + } + } } + return Changed; +} + +bool PostPipeliner::computeLoopCarriedParameters() { + + // Forward properties like Earliest and Ancestors. + computeForward(); - // Propagate Earliest upstream, initialize Latest + // Backward properties like Latest and Offspring. + // Use a fixpoint loop, because plain reversed order may not be topological + // for predecessors + while (computeBackward()) { + /* EMPTY */; + } + + // Adjust Earliest and Latest with resource requirements. + // FIXME: We do not account for negative latencies here. This can lead to + // suboptimality, but we only include true dependences, where negative + // latencies are rare. for (int K = 0; K < NInstr; K++) { - const int K2 = K + NInstr; - const int Earliest = Info[K2].Earliest - II; + auto &Me = Info[K]; + SlotCounts ASlots(Me.Slots); + for (int A : Me.Ancestors) { + ASlots += Info[A].Slots; + } + SlotCounts OSlots(Me.Slots); + for (int O : Me.Offspring) { + OSlots += Info[O].Slots; + } + LLVM_DEBUG(dbgs() << "SU" << K << " : " << Info[K].Earliest << " - " + << Info[K].Latest << " " << ASlots << " " << OSlots + << "\n"); + Me.Earliest = std::max(Me.Earliest, 0 + (ASlots.max() - 1)); + Me.Latest = std::min(Me.Latest, -1 - (OSlots.max() - 1)); + LLVM_DEBUG(dbgs() << " -> " << Info[K].Earliest << " - " + << Info[K].Latest << "\n"); + } + + // Loop carried dependences will have pushed away Earliest of the second + // iteration, which should stay in lock step with the first. + for (int K = 0; K < NInstr; K++) { + const int KNextIter = K + NInstr; + const int Earliest = Info[KNextIter].Earliest - II; Info[K].Earliest = std::max(Info[K].Earliest, Earliest); - // Unrestricted: Beyond the last stage. - Info[K].Latest = NCopies * II; } - // Propagate Latest upstream. 
Latest is the latest - // that is admissible for Earliest to be achievable within II + + // Make Earliest of the second iteration push up Latest of the first for (int K = 0; K < NInstr; K++) { - const int K2 = K + NInstr; - const int Earliest = Info[K2].Earliest; - const auto &SU = DAG->SUnits[K2]; - for (auto &Dep : SU.Preds) { - const auto *Pred = Dep.getSUnit(); - // Any predecessor in the first iteration - int K1 = Pred->NodeNum; - if (K1 < NInstr) { - const int Latest = Earliest - Dep.getSignedLatency(); - Info[K1].Latest = std::min(Info[K1].Latest, Latest); + auto &Me = Info[K]; + int LCDLatest = Me.Latest; + auto &SU = DAG->SUnits[K]; + for (auto &Dep : SU.Succs) { + const int S = Dep.getSUnit()->NodeNum; + if (S < NInstr) { + continue; } + const int Earliest = Info[S - NInstr].Earliest; + const int Latest = Earliest - Dep.getSignedLatency(); + LCDLatest = std::min(LCDLatest, Latest); + } + Me.LCDLatest = LCDLatest; + if (LCDLatest != Me.Latest) { + LLVM_DEBUG(dbgs() << "SU" << K << " LCDLatest=" << Me.LCDLatest << "\n"); } } - LLVM_DEBUG(for (int K = 0; K < NInstr; K++) { - dbgs() << "SU" << K << " : " << Info[K].Earliest << " - " << Info[K].Latest - << "\n"; - }); + + // Save the static values for ease of reset + for (auto &N : Info) { + N.StaticEarliest = N.Earliest; + N.StaticLatest = N.Latest; + } + return true; +} + +int PostPipeliner::computeMinScheduleLength() const { + // The minimum length makes sure that every node has a range in which it + // can be scheduled + int MinLength = II; + for (int K = 0; K < NInstr; K++) { + auto &Node = Info[K]; + while (Node.Earliest > Node.Latest + MinLength) { + MinLength += II; + } + } + return MinLength; } void dumpGraph(int NInstr, const std::vector<NodeInfo> &Info, @@ -238,33 +362,54 @@ if (S >= NInstr) { dbgs() << "_" << S % NInstr; } - dbgs() << "# L=" << Dep.getSignedLatency() << "\n"; + if (Dep.getKind() == SDep::Data) { + dbgs() << " [color=red] "; + } else if (Dep.getKind() == SDep::Output) { + dbgs() << " [color=black] "; + } else if (Dep.getKind() == SDep::Anti) { + dbgs() << " [color=blue] "; + } + + dbgs() << " # L=" << Dep.getSignedLatency(); + if (Dep.getKind() == SDep::Output) { + dbgs() << " WAW"; + } + dbgs() << "\n"; } } dbgs() << "}\n"; } -int PostPipeliner::mostUrgent() { - assert(FirstUnscheduled < NInstr); +int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) { + assert(FirstUnscheduled <= LastUnscheduled); while (Info[FirstUnscheduled].Scheduled) { FirstUnscheduled++; } - assert(FirstUnscheduled < NInstr); + while (Info[LastUnscheduled].Scheduled) { + LastUnscheduled--; + } + assert(FirstUnscheduled <= LastUnscheduled); + + auto NotScheduled = [&](const auto &Dep) { + auto *SU = Dep.getSUnit(); + if (SU->isBoundaryNode()) { + return false; + } + int N = SU->NodeNum; + return N < NInstr && !Info[N].Scheduled; + }; int Best = -1; LLVM_DEBUG(dbgs() << "Available:"); - for (int K = FirstUnscheduled; K < NInstr; K++) { + for (int K = FirstUnscheduled; K <= LastUnscheduled; K++) { const auto &SU = DAG->SUnits[K]; + auto &Edges = Strategy.fromTop() ? SU.Preds : SU.Succs; // Check whether it is available - if (any_of(SU.Preds, [&](const auto &Dep) { - return !Info[Dep.getSUnit()->NodeNum].Scheduled; - })) { + if (Info[K].Scheduled || any_of(Edges, NotScheduled)) { continue; } LLVM_DEBUG(dbgs() << " SU" << K); - // Yeah, I know. This is a difficult way to schedule in the original - // node order. Have patience, my friend. 
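// A minimal standalone sketch (not part of the patch) of the selection that
// mostUrgent() performs with the new Strategy hook: among the ready nodes,
// keep the first candidate that no later candidate beats according to the
// strategy's better() predicate. ToyStrategy and toyPickBest are illustrative
// names only, not identifiers from the patch.
#include <vector>

struct ToyStrategy {
  // Return true when A is strictly preferable to B; plain node order here.
  bool better(int A, int B) const { return A < B; }
};

int toyPickBest(const std::vector<int> &Ready, const ToyStrategy &S) {
  int Best = -1;
  for (int N : Ready)
    if (Best == -1 || S.better(N, Best))
      Best = N;
  return Best; // -1 when no node is ready
}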
- if (Best == -1) { + if (Best == -1 || Strategy.better(SU, DAG->SUnits[Best])) { Best = K; LLVM_DEBUG(dbgs() << "*"); } @@ -274,24 +419,41 @@ int PostPipeliner::mostUrgent() { return Best; } -bool PostPipeliner::scheduleFirstIteration() { +void PostPipeliner::resetSchedule(bool FullReset) { + Scoreboard.clear(); + for (int K = 0; K < NTotalInstrs; K++) { + auto &N = Info[K]; + N.reset(FullReset); + if (K < NInstr) { + N.Earliest = N.StaticEarliest; + N.Latest = N.StaticLatest; + } + } + + FirstUnscheduled = 0; + LastUnscheduled = NInstr - 1; +} + +bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { // Set up the basic schedule from the original instructions for (int K = 0; K < NInstr; K++) { - const int N = mostUrgent(); + const int N = mostUrgent(Strategy); LLVM_DEBUG(dbgs() << " Trying " << N << "\n"); SUnit &SU = DAG->SUnits[N]; MachineInstr *MI = SU.getInstr(); - const int Earliest = Info[N].Earliest; + const int Earliest = Strategy.earliest(SU); + const int Latest = Strategy.latest(SU); // Find the first cycle that fits. We try every position modulo II - const int Actual = fit(MI, Earliest, II, II); + const int Actual = Strategy.fromTop() ? fit(MI, Earliest, Latest + 1, II) : fit(MI, Latest, Earliest - 1, II); if (Actual < 0) { // out of resources for this II; LLVM_DEBUG(dbgs() << "Out of resources\n"); return false; } + Strategy.selected(SU); const int LocalCycle = Actual % II; const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI); - LLVM_DEBUG(dbgs() << " Emit in " << -Depth + LocalCycle << "\n"); int Cycle = -Depth + LocalCycle; LLVM_DEBUG(dbgs() << " Emit in " << Cycle << "\n"); for (int N = 0; N < NCopies; N++) { @@ -306,12 +468,23 @@ bool PostPipeliner::scheduleFirstIteration() { scheduleNode(SU, Actual); Info[N].Scheduled = true; - LLVM_DEBUG(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull();); + DEBUG_FULL(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull();); } - LLVM_DEBUG(dbgs() << "==== First iteration scheduled ======\n"); + LLVM_DEBUG(dbgs() << "==== First iteration scheduled by " << Strategy.name() + << " ====\n"); return true; } +namespace { +void dumpEarliestChain(const std::vector<NodeInfo> &Info, int N) { + auto Prev = Info[N].LastEarliestPusher; + if (Prev) { + dumpEarliestChain(Info, *Prev); + } + dbgs() << " --> " << N << " @" << Info[N].Cycle << "\n"; +} +} // namespace + bool PostPipeliner::scheduleOtherIterations() { // Make sure that all the copies can be placed at II from the previous one. 
// This looks like overkill, but it accommodates dependences that span @@ -328,8 +501,9 @@ // All iterations following the first one should fit exactly if (Earliest > Insert) { - LLVM_DEBUG(dbgs() << " Latency not met (Earliest=" << Earliest - << ")\n"); + LLVM_DEBUG(dbgs() << " Latency not met for " << N + << " (Earliest=" << Earliest << ")\n"; + dumpEarliestChain(Info, N);); return false; } @@ -339,6 +513,195 @@ return true; } +class DefaultStrategy : public PostPipelinerStrategy { +public: + DefaultStrategy(ScheduleDAGMI &DAG, std::vector<NodeInfo> &Info, + int LatestBias) + : PostPipelinerStrategy(DAG, Info, LatestBias) {} + bool better(const SUnit &A, const SUnit &B) override { + return Info[A.NodeNum].Latest < Info[B.NodeNum].Latest; + } +}; + +class ConfigStrategy : public PostPipelinerStrategy { + bool TopDown = true; + +public: + enum PriorityComponent { + NodeNum, + Latest, + Critical, + Sibling, + LCDLatest, + Size + }; + static std::string getPriorityName(PriorityComponent Component) { + switch (Component) { + case PriorityComponent::NodeNum: + return "NodeNum"; + case PriorityComponent::Latest: + return "Latest"; + case PriorityComponent::Critical: + return "Critical"; + case PriorityComponent::Sibling: + return "Sibling"; + case PriorityComponent::LCDLatest: + return "LcdLatest"; + default: + break; + } + return "Size - Illegal"; + } + +private: + std::string Name; + std::set<int> SuccSiblingScheduled; + std::set<int> PredSiblingScheduled; + std::function<bool(const SUnit &, const SUnit &)> + Discriminators[PriorityComponent::Size] = { + [&](const SUnit &A, const SUnit &B) { + return TopDown ? A.NodeNum < B.NodeNum : A.NodeNum > B.NodeNum; + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return TopDown ? IA.Latest < IB.Latest : IA.Earliest > IB.Earliest; + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return TopDown ? IA.NumPushedEarliest > IB.NumPushedEarliest + : IA.NumPushedLatest > IB.NumPushedLatest; + }, + [&](const SUnit &A, const SUnit &B) { + std::set<int> &Sibling = TopDown ? 
SuccSiblingScheduled : PredSiblingScheduled; + return Sibling.count(A.NodeNum) > Sibling.count(B.NodeNum); + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return IA.LCDLatest < IB.LCDLatest; + }, + }; + std::vector<PriorityComponent> Priority; + + bool fromTop() override { return TopDown; } + + bool better(const SUnit &A, const SUnit &B) override { + for (auto P : Priority) { + if (Discriminators[P](A, B)) { + return true; + } + } + return false; + } + + void selected(const SUnit &N) override { + // Promote the critical path + NodeInfo *Pushed = &Info[N.NodeNum]; + while (Pushed->LastEarliestPusher) { + Pushed = &Info[*Pushed->LastEarliestPusher]; + Pushed->NumPushedEarliest++; + } + + // Promote my siblings + for (auto &SDep : N.Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + for (auto &PDep : SDep.getSUnit()->Preds) { + if (PDep.getKind() != SDep::Data) { + continue; + } + SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum); + } + } + for (auto &PDep : N.Preds) { + if (PDep.getKind() != SDep::Data) { + continue; + } + for (auto &SDep : PDep.getSUnit()->Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + PredSiblingScheduled.insert(PDep.getSUnit()->NodeNum); + } + } + } + +public: + std::string name() override { return Name; } + ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info, + int Length, bool TopDown, + ArrayRef<PriorityComponent> Components) + : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown) { + Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown); + for (auto Comp : Components) { + Name += "_" + getPriorityName(Comp); + Priority.emplace_back(Comp); + } + } +}; + +static const struct { + int ExtraStages; + bool TopDown; + bool Rerun; + ConfigStrategy::PriorityComponent Components[3]; +} Strategies[] = { + // Loosely speaking, a lower value of the first parameter targets + // a lower stage count, which benefits code size. + // Rerun is only useful for heuristics that use it, e.g. 
Critical + {1, true, false, {ConfigStrategy::NodeNum}}, + {1, true, false, {ConfigStrategy::Latest}}, + {1, true, true, {ConfigStrategy::Critical}}, + {1, true, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + {0, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + {1, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + // This is pure bottom up + {1, false, false, {ConfigStrategy::NodeNum}}, +}; + +bool PostPipeliner::tryHeuristics() { + int MinLength = computeMinScheduleLength(); + + DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); + + int HeuristicIndex = 0; + for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) { + if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { + continue; + } + ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, TopDown, + Components); + resetSchedule(/*FullReset=*/true); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n"); + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " found II=" << II + << "\n"); + return true; + } + + DEBUG_SUMMARY(dbgs() << " failed\n"); + if (!Rerun) { + continue; + } + + // Rerun with dynamic information retained + resetSchedule(/*FullReset=*/false); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() + << " with critical path"); + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n"); + return true; + } + DEBUG_SUMMARY(dbgs() << " failed\n"); + } + DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); + return false; +} + bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { NTotalInstrs = TheDAG.SUnits.size(); assert(NTotalInstrs % NInstr == 0); @@ -349,7 +712,6 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { } II = InitiationInterval; DAG = &TheDAG; - FirstUnscheduled = 0; // Let's not skimp on size here. This allows us to insert any instruction // in the unrolled dag. 
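// A minimal standalone sketch (not part of the patch) of the modulo resource
// probe behind fit(): candidate cycles are scanned from First towards Last,
// and a cycle is accepted when its slot requirements do not collide with what
// is already recorded at that cycle modulo II. A plain bitmask per modulo
// cycle stands in for the hazard-recognizer scoreboard; toyFit is an
// illustrative name only, and ModuloScoreboard must hold at least II entries.
#include <cstdint>
#include <vector>

int toyFit(const std::vector<uint64_t> &ModuloScoreboard, uint64_t Slots,
           int First, int Last, int II) {
  const int Step = First > Last ? -1 : 1;
  for (int C = First; C != Last; C += Step) {
    const int Mod = ((C % II) + II) % II; // keep the index non-negative
    if (!(ModuloScoreboard[Mod] & Slots))
      return C; // first conflict-free cycle in the scan direction
  }
  return -1; // out of resources for this II
}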
@@ -358,12 +720,14 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { Info.clear(); Info.resize(NTotalInstrs); + LLVM_DEBUG(for (int I = 0; I < NInstr; I++) { dbgs() << I << " " << *DAG->SUnits[I].getInstr(); }); LLVM_DEBUG(dumpGraph(NInstr, Info, DAG)); computeLoopCarriedParameters(); - if (!scheduleFirstIteration() || !scheduleOtherIterations()) { + + if (!tryHeuristics()) { LLVM_DEBUG(dbgs() << "PostPipeliner: No schedule found\n"); return false; } @@ -461,4 +825,17 @@ void PostPipeliner::updateTripCount() const { TII->adjustTripCount(*TripCountDef, -Delta); } +void NodeInfo::reset(bool FullReset) { + Cycle = 0; + Scheduled = false; + Earliest = 0; + Latest = -1; + if (FullReset) { + NumPushedEarliest = 0; + NumPushedLatest = 0; + LastEarliestPusher = {}; + LastLatestPusher = {}; + } +} + } // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index d9bcb558103a..5fa8ca8d7f49 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -15,8 +15,10 @@ #define LLVM_LIB_TARGET_AIE_AIEPOSTPIPELINER_H #include "AIEHazardRecognizer.h" +#include "AIESlotCounts.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ResourceScoreboard.h" +#include #include namespace llvm { @@ -43,14 +45,77 @@ class NodeInfo { int ModuloCycle = 0; // Cycle / II int Stage = 0; + // The earliest cycle at which this can be scheduled to meet latencies // This includes the lowerbound of the modulo condition, i.e. // Earliest(N) >= Cycle(N - NInstr) + II int Earliest = 0; - // For an LCD K1 -> K2, this holds II + Earliest(K2 - NInstr) - Latency(LCD) - // Instructions with lower Latest have higher priority in the - // top down scheduling - int Latest = 0; + + // The latest cycle at which this can be scheduled. This is a negative value + // relative to the length of the linear schedule. + // So -1 is the last cycle of the linear schedule, -Length is the first cycle + // of the linear schedule. Note that this length is usually rounded up to + // the next multiple of the initiation interval + int Latest = -1; + + // These are the values of Earliest and Latest as computed from the a-priori + // computations. During scheduling Earliest and Latest may be adjusted to + // more accurate values. The two values are cached here to facilitate cheaper + // reset before trying a new strategy for the same II. + int StaticEarliest = 0; + int StaticLatest = -1; + + // Slots necessary for this instruction. + SlotCounts Slots; + + // Record critical path components + // The Pred/Succ that pushed my Earliest/Latest + std::optional<int> LastEarliestPusher; + std::optional<int> LastLatestPusher; + // The number of Succs/Preds whose Earliest/Latest I have pushed. + int NumPushedEarliest = 0; + int NumPushedLatest = 0; + + // Latest corrected by taking Earliest of an LCD successor into account + int LCDLatest = -1; + + // The transitive closure of my predecessors + std::unordered_set<int> Ancestors; + + // The transitive closure of my successors + std::unordered_set<int> Offspring; + + /// Reset the node to the values computed statically /// If FullReset is true, also reset the accumulated dynamic data. 
+ void reset(bool FullReset); +}; + +class PostPipelinerStrategy { +protected: + ScheduleDAGInstrs &DAG; + std::vector<NodeInfo> &Info; + int LatestBias = 0; + +public: + PostPipelinerStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info, + int LatestBias) + : DAG(DAG), Info(Info), LatestBias(LatestBias) {}; + virtual ~PostPipelinerStrategy() {}; + // Provide a name for logging purposes + virtual std::string name() { return "PostPipelinerStrategy"; } + // Choose among available alternatives + virtual bool better(const SUnit &A, const SUnit &B) { return false; } + // Define the earliest cycle in which to insert \p N + virtual int earliest(const SUnit &N) { return Info[N.NodeNum].Earliest; } + // Define the latest cycle in which to insert \p N + virtual int latest(const SUnit &N) { + return Info[N.NodeNum].Latest + LatestBias; + } + // Select from top or from bottom. + virtual bool fromTop() { return true; } + // Report a final selection. This marks the start of selecting a new node. + // fromTop() should be invariant between calls to selected() + virtual void selected(const SUnit &N) {}; }; class PipelineScheduleVisitor { @@ -72,6 +137,7 @@ class PostPipeliner { int NTotalInstrs = 0; int FirstUnscheduled = 0; + int LastUnscheduled = -1; /// Holds the cycle of each SUnit. The following should hold: /// Cycle(N) mod II == Cycle(N % NInstr) mod II @@ -96,7 +162,8 @@ int II = 1; int NStages = 0; - /// Place SU in cycle Cycle; update Earliest of dependent instructions + /// Place SU in cycle Cycle; update Earliest of successors and Latest + /// of predecessors. void scheduleNode(SUnit &SU, int Cycle); /// Compute the stage in which each instruction runs @@ -108,20 +175,38 @@ int fit(MachineInstr *MI, int Earliest, int NTries, int II); /// Provide some look ahead by seeing the effect of the first iteration - /// on the second iteration. - void computeLoopCarriedParameters(); + /// on the second iteration. May return false if the II isn't feasible. + bool computeLoopCarriedParameters(); + + /// Helpers of computeLoopCarriedParameters() + void computeForward(); + bool computeBackward(); + + // Given Earliest and Latest of each node in the first iteration, + // compute the smallest length of the linear schedule that is feasible. + // This length will be a multiple of the initiation interval. + int computeMinScheduleLength() const; + + /// Try all heuristics, stop at the first that fits the II. + /// If it returns true, a valid schedule is laid down in Info. + bool tryHeuristics(); /// Find the first available unscheduled instruction with the highest /// priority - int mostUrgent(); + int mostUrgent(PostPipelinerStrategy &Strategy); /// Schedule the original instructions, taking the modulo scoreboard /// into account - bool scheduleFirstIteration(); + bool scheduleFirstIteration(PostPipelinerStrategy &Strategy); /// Check that all copied instructions can run in the same modulo cycle bool scheduleOtherIterations(); + /// Reset dynamic scheduling data. 
+ /// If FullReset is set, also reset the information collected from earlier + /// data-mining scheduling rounds. + void resetSchedule(bool FullReset); + public: PostPipeliner(const AIEHazardRecognizer &HR, int NInstr); diff --git a/llvm/lib/Target/AIE/AIESlotCounts.cpp b/llvm/lib/Target/AIE/AIESlotCounts.cpp new file mode 100644 index 000000000000..e9d85e552ef7 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotCounts.cpp @@ -0,0 +1,72 @@ +//===- AIESlotCounts.cpp - SlotCount utility ------------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIESlotCounts.h" + +namespace llvm { +namespace AIE { + +SlotCounts::SlotCounts(SlotBits Bits) { + while (Bits) { + assert(Size < MaxSlots); + Counts[Size] = Bits & 1; + Size++; + Bits >>= 1; + } +} + +SlotCounts::SlotCounts(const SlotCounts &Org) : Size(Org.Size) { + for (int I = 0; I < Size; I++) { + Counts[I] = Org.Counts[I]; + } +} + +int SlotCounts::max() { + int Max = 0; + for (int I = 0; I < Size; I++) { + Max = std::max(Max, int(Counts[I])); + } + return Max; +} + +SlotCounts &SlotCounts::operator+=(const SlotCounts &Other) { + // The common part + for (int I = 0; I < Size && I < Other.Size; I++) { + Counts[I] += Other.Counts[I]; + } + // Any excess from the other + while (Size < Other.Size) { + Counts[Size] = Other.Counts[Size]; + Size++; + } + assert(Size >= Other.Size); + assert(Size < MaxSlots); + return *this; +} + +SlotCounts SlotCounts::operator+(const SlotCounts &Other) const { + SlotCounts Result(*this); + return Result += Other; +} + +} // namespace AIE + +raw_ostream &operator<<(raw_ostream &OS, const AIE::SlotCounts &Val) { + OS << "{ "; + const char *Sep = ""; + for (int I = 0; I < Val.size(); I++) { + OS << Sep << Val[I]; + Sep = ", "; + } + OS << " }"; + return OS; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AIE/AIESlotCounts.h b/llvm/lib/Target/AIE/AIESlotCounts.h new file mode 100644 index 000000000000..34ddd0d09542 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotCounts.h @@ -0,0 +1,55 @@ +//===- AIESlotCounts.h - Resource computation utility ---------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// This defines a class that can be used to tally up the slots required for +// one or more instructions +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIESLOTCOUNTS_H +#define LLVM_LIB_TARGET_AIE_AIESLOTCOUNTS_H + +#include "MCTargetDesc/AIEFormat.h" + +namespace llvm { +namespace AIE { + +/// Efficient representation of slot requirements +class SlotCounts { + static const int MaxSlots = 16; + int Counts[MaxSlots]; + // The number of valid Counts. Further counts are assumed to be zero. 
+ int Size = 0; + +public: + // Useful constructors + SlotCounts() = default; + SlotCounts(SlotBits Bits); + SlotCounts(const SlotCounts &Org); + SlotCounts &operator=(const SlotCounts &Rhs) = default; + + // Compute the number of required cycles + int max(); + + // Add slot counts of Other to this + SlotCounts &operator+=(const SlotCounts &Other); + + // By-value addition. + SlotCounts operator+(const SlotCounts &Other) const; + + // Indexing + const int &operator[](int I) const { return Counts[I]; }; + + int size() const { return Size; } +}; +} // namespace AIE + +raw_ostream &operator<<(raw_ostream &OS, const AIE::SlotCounts &Val); + +} // namespace llvm +#endif // LLVM_LIB_TARGET_AIE_AIESLOTCOUNTS_H diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 85dda5330112..191ea6305274 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -94,6 +94,7 @@ add_llvm_target(AIECodeGen AIEPseudoBranchExpansion.cpp AIERegClassConstrainer.cpp AIERegisterInfo.cpp + AIESlotCounts.cpp AIESplitInstructionRewriter.cpp AIESubRegConstrainer.cpp AIESubtarget.cpp diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/README b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/README new file mode 100644 index 000000000000..72bf8ee7396e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/README @@ -0,0 +1,19 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +The tests in this directory are single loops that need to be post-pipelined. +They supply the input to the scheduler and test the generated assembly code. +It is likely that the detailed schedule changes over time; the checks can be +automatically updated provided that neither the II nor the stage count grows. +If the stage count grows, an automatic update is allowed if the II shrinks. +The II is the number of lines from the loop block's label up to and including +the cycle headed by the loop end label. +The stage count is determined by the immediate operand of the instruction that +sets the lc register. + +Note that the LLVM IR doesn't match the actual MIR code. It is just a standard +loop providing some pointers into different spaces to dereference. + diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir index fe25c964c908..92e2e7a8089b 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir @@ -5,7 +5,8 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - 2>&1 | FileCheck %s # add-store can run in a two-stage II=1 pipeline diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir index 06aa9aeb6418..401cbb80ed65 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir @@ -7,7 +7,8 @@ # (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates -# RUN: llc -O2 --mtriple=aie2 --start-before=postmisched %s -o - | FileCheck %s +# RUN: llc -O2 --mtriple=aie2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - 2>&1 | FileCheck %s --- | define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %cond, ptr %cond.i50, <16 x i32> %0, i32 %cond67.i79, i20 %idx.ext.i.i81, i20 %idx.ext.i404.i, i20 %idx.ext.i410.i, i20 %idx.ext.i434.i85, i32 %1, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i32 %7, i32 %8, i32 %or9.i.i.i.i.i96, i32 %9, i20 %idx.ext.i422.i82, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i20 %20, i20 %21, i20 %22, i20 %23, i32 %conv192.i107, i20 %24, i20 %idx.ext.i428.i, i20 %25, i20 %26, i20 %27, i32 %28) #0 { diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir new file mode 100644 index 000000000000..6866f48a3518 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir @@ -0,0 +1,238 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from conv2d_bf16_0 + +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; mov p7, p5 + ; CHECK-NEXT: vldb wl8, [p0], m4 + ; CHECK-NEXT: vldb wh10, [p0, #32] + ; CHECK-NEXT: vldb wl10, [p0], m4 + ; CHECK-NEXT: vldb wh1, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0], m4 + ; CHECK-NEXT: vldb wh3, [p0, #32]; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: vlda wl7, [p4, #320]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; vlda wh9, [p4, #416]; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl9, [p4, #384]; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv + ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl5, [p5], #256; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv + ; CHECK-NEXT: nopb ; vlda wh11, [p4, #480]; nops ; nopx ; vshift.align x6, x6, s0, x3, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl11, [p4, #448]; nops ; nopx ; vshuffle x3, x4, x6, r9; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: nopa ; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; 
CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopx ; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vlda wl7, [p4, #320] + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: nopx + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi 
ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 + + $p7 = MOV_mv_scl $p5 + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 + $x8 = VSHUFFLE $x0, $x2, $r9 + $x3 = VSHUFFLE $x4, $x6, $r9 + $x5 = VSHUFFLE $x0, $x2, $r25 + $x10 = VSHUFFLE $x4, $x6, $r25 + $x1 = VSHUFFLE $x3, $x5, $r13 + $x3 = VSHUFFLE $x3, 
$x5, $r24 + $wh5 = VLD_idx_imm_3x32_pseudo $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p5 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p5, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 320 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 416 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 384 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 480 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 448 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $r3 = MOV_mv_scl $p0 + $r3 = AND $r3, $r0 + $r3 = nuw nsw ADD_add_r_ri $r3, 34, implicit-def $srcarry + $p4 = MOV_mv_scl $p7 + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir new file mode 100644 index 000000000000..85064a43cb53 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir @@ -0,0 +1,233 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from conv2d_bf16_0 + +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wh7, [p7, #32]; mov p4, p2 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; nopx ; mov p5, p7 + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl8, [p0], m4 + ; CHECK-NEXT: vldb wh10, [p0, #32] + ; CHECK-NEXT: vldb wl10, [p0], m4 + ; CHECK-NEXT: vldb wh1, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0], m4; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r1, p0; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; and r2, r1, r0; vshift.align x0, x0, s0, x8, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv + ; CHECK-NEXT: nopb ; vlda wh5, [p2, #352]; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv + ; CHECK-NEXT: vldb wl5, [p4], #64; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv + ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 + ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh7, [p7, #32]; nopa ; nops ; nopx ; mov p4, p2; vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; nopx ; mov p5, p7; vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: mov r1, p0; vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: and r2, r1, r0; vshift.align x0, x0, s0, x8, r3; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r3; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; 
CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 + + $p5 = MOV_mv_scl $p7 + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 + $x8 = VSHUFFLE $x0, $x2, $r9 + $x3 = VSHUFFLE $x4, $x6, $r9 + $x5 = VSHUFFLE $x0, $x2, $r25 + $x10 = VSHUFFLE $x4, $x6, $r25 + $x1 = VSHUFFLE $x3, $x5, $r13 + $x3 = VSHUFFLE $x3, $x5, $r24 + $wh7 = VLD_idx_imm_3x32_pseudo $p7, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl7, $p7 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p7, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $p4 = MOV_mv_scl $p2 + $p4 = nuw PADD_imm9_pseudo $p4, 320 + $wh5 = VLDA_dmw_lda_w_ag_idx_imm $p2, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p4 = VLD_pstm_imm_4x32_pseudo $p4, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh9 = VLD_idx_imm_3x32_pseudo $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl9, $p4 = VLD_pstm_imm_4x32_pseudo $p4, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11 = VLD_idx_imm_3x32_pseudo $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11 = VLD_idx_imm_3x32_pseudo $p4, 0 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, 
$r29, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $r1 = MOV_mv_scl $p0 + $r2 = AND $r1, $r0 + $r3 = nuw nsw ADD_add_r_ri $r2, 34, implicit-def $srcarry + $p2 = MOV_mv_scl $p5 + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir index 655cdee89a7a..a5dae2d34a2a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir @@ -29,13 +29,13 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopx + ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv + ; CHECK-NEXT: nopa ; nopx + ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop - ; CHECK-NEXT: mov p2, p1 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r0, [p0, #0]; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir new file mode 100644 index 000000000000..d2e5db47a8e9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: // 
%bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
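+# The expected schedule above pipelines the loop in two stages: %bb.1 issues
+# the loads and shuffles of the first iteration, the .LBB0_2 body overlaps the
+# loads of iteration i+1 with the vmac.f chain of iteration i, and %bb.3
+# completes the final iteration's multiply-accumulates.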
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x8 = VSHUFFLE $x8, $x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x0 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def 
$srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir new file mode 100644 index 000000000000..0cd59c1838d1 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x8, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; 
CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x0, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x8, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x0, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + 
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x0 = VSHUFFLE $x8, 
$x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x10 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir new file mode 100644 index 000000000000..7be844a699a2 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x2, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x2, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x2, x2, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: // 
%bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x2, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
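+# Same kernel as gemm-1.mir, except that the cross-iteration vshuffle results
+# are assigned to different registers (x2 here, versus x8 in gemm-1 and x0 in
+# gemm-2), presumably to probe the post-pipeliner against a different
+# anti-dependence pattern.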
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x2 = VSHUFFLE $x2, $x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x0 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def 
$srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x2, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir new file mode 100644 index 000000000000..d27b6a59be9c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir @@ -0,0 +1,213 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner,postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl8, [p1], m5; nopx + ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb wl9, [p1], m5 + ; CHECK-NEXT: vldb wl0, [p0, #0] + ; CHECK-NEXT: vldb wh0, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0, #64] + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; add.nc lc, r0, #-1 + ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb wh2, [p0], #32; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; nopv + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 
+ ; CHECK-NEXT: vldb wl8, [p1], m5; vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: vldb wh8, [p1], m6; nopx ; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; nopv + ; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label 
%for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl0 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADDS_st_ptr_inc_idx $p0, $m4 + $p0 = nuw PADD_imm9_pseudo $p0, 128 + $wl2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x4 = VSHUFFLE $x0, $x2, $r3 + $x5 = VSHUFFLE $x0, $x2, $r16 + $x6 = VSHUFFLE $x1, $x3, $r3 + $x7 = VSHUFFLE $x1, $x3, $r16 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl10, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh11, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d 
:: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x8 = VSHUFFLE $x8, $x8, $r6 + $x9 = VSHUFFLE $x9, $x9, $r6 + $x10 = VSHUFFLE $x10, $x10, $r6 + $x11 = VSHUFFLE $x11, $x11, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir new file mode 100644 index 000000000000..55b804bc9868 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir @@ -0,0 +1,240 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner,postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p0], #32; nopx + ; CHECK-NEXT: vldb wl1, [p0], #32 + ; CHECK-NEXT: vldb wh1, [p0], #32 + ; CHECK-NEXT: vldb wl8, [p1], m5; padds [p0], m4 + ; CHECK-NEXT: vldb wl2, [p0], #32 + ; CHECK-NEXT: vldb wh2, [p0], #32 + ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb wl9, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl10, [p1], m5 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16 + ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; add.nc lc, r0, #-2; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; movxm ls, #.LBB0_2; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm le, #.L_LEnd0; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, 
x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void 
@llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADDS_st_ptr_inc_idx $p0, $m4 + $wl2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x4 = VSHUFFLE $x0, $x2, $r3 + $x5 = VSHUFFLE $x0, $x2, $r16 + $x6 = VSHUFFLE $x1, $x3, $r3 + $x7 = VSHUFFLE $x1, $x3, $r16 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl10, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh11, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x8 = VSHUFFLE $x8, $x8, $r6 + $x9 = VSHUFFLE $x9, $x9, $r6 + $x10 = VSHUFFLE $x10, $x10, $r6 + $x11 = 
VSHUFFLE $x11, $x11, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir new file mode 100644 index 000000000000..7653de7caab9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir @@ -0,0 +1,243 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wl11, [p1], m5; nopxm ; nops + ; CHECK-NEXT: vldb wh11, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vlda wh5, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0], #32 + ; CHECK-NEXT: vldb wh8, [p0], #32 + ; CHECK-NEXT: vldb wl1, [p0], #32 + ; CHECK-NEXT: vldb wh1, [p0], #32 + ; CHECK-NEXT: paddb [p0], m4 + ; CHECK-NEXT: vldb wl0, [p0], #32 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wl0, [p1], m5 + ; CHECK-NEXT: vldb wh0, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x2, x8, x0, r16 + ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vlda wh5, [p1], m6; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0], #32; vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; add.nc lc, r0, #-2; vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: paddb [p0], m4; movxm ls, #.LBB0_2; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; movxm le, #.L_LEnd0; vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; vldb wl11, [p1], m5; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vlda wh5, [p1], m6; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0], #32; vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl1, 
[p0], #32; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x5, x5, x5, r2; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wh0, [p1], m6; vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %p5 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %p6 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %p6, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %p5, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p5, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %p6, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind 
willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $x6 = VSHUFFLE $x8, $x0, $r4 + $x2 = VSHUFFLE $x8, $x0, $r16 + $x10 = VSHUFFLE $x1, $x3, $r4 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh11, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $x3 = VSHUFFLE $x11, $x11, $r2 + $x5 = VSHUFFLE $x5, $x5, $r2 + $x0 = VSHUFFLE $x0, $x0, $r2 + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense 
$bmh1, $x2, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x2, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x9, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir new file mode 100644 index 000000000000..c42c77b6337b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir @@ -0,0 +1,215 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0, #64] + ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4 + ; CHECK-NEXT: paddb [p0], #128 + ; CHECK-NEXT: vldb wl10, [p0], #32 + ; CHECK-NEXT: vldb wh10, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl3, [p1], m5; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wh3, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopx ; vshuffle x6, x8, x10, r3; nopv + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopx ; vshuffle x11, x8, x10, r16; nopv + ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopx ; vshuffle x0, x1, x3, r3; nopv + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x1, x3, r16; nopv + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vshuffle x3, x3, x3, r6 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4; vmac.f bmh1, bmh1, x11, x3, r2 + ; CHECK-NEXT: paddb [p0], #128; vshuffle x0, x0, x0, r6; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh5, bmh5, x11, x5, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh6, bmh6, x0, x5, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vldb wl3, [p1], m5; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vldb wh3, [p1], m6; vmac.f bml0, bml0, x11, x0, r2 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml1, bml1, x0, x0, r2 + ; CHECK-NEXT: vldb wh5, [p1], m6; vshuffle x6, x8, x10, r3; vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vldb wl0, [p1], m5; vshuffle x11, x8, x10, r16; vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x0, x1, x3, r3; vmac.f bml4, bml4, x11, x7, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bml6, bml6, x0, x7, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r6; nopv + ; CHECK-NEXT: nopx + ; CHECK-NEXT: vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x11, x3, r2 
+ ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x11, x5, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vmac.f bml0, bml0, x11, x0, r2 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x0, r2 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vmac.f bml4, bml4, x11, x7, r2 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x7, r2 + ; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x6 = VSHUFFLE $x8, $x10, $r3 + $x11 = VSHUFFLE $x8, $x10, $r16 + $x0 = VSHUFFLE $x1, $x3, $r3 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x3 = VSHUFFLE $x3, $x3, $r6 + $x5 = VSHUFFLE $x5, $x5, $r6 + $x7 = VSHUFFLE $x7, $x7, $r6 + $x0 = VSHUFFLE $x0, $x0, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x11, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x11, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = 
VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x11, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x11, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir new file mode 100644 index 000000000000..dd2f9173bb7e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir @@ -0,0 +1,208 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vlda wl11, [p1], m5; vldb wl8, [p0], #32; nopxm + ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32 + ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32 + ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32 + ; CHECK-NEXT: paddb [p0], m4 + ; CHECK-NEXT: vldb wl0, [p0], #32 + ; CHECK-NEXT: vldb wh0, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r6; nopv + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x8, x0, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x8, x0, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopx ; 
vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 + ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32; nopx ; vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 + ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r2 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r2 + ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r2 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6; vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; vmac.f bml4, bml4, x2, x7, r2 + ; CHECK-NEXT: vshuffle x6, x8, x0, r3; vmac.f bml6, bml6, x10, x7, r2 + ; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 + ; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 + ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r2 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r2 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r2 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r2 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vmac.f bml4, bml4, x2, x7, r2 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r2 + ; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function 
Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x6 = VSHUFFLE $x8, $x0, $r3 + $x2 = VSHUFFLE $x8, $x0, $r16 + $x10 = VSHUFFLE $x1, $x3, $r3 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x3 = VSHUFFLE $x11, $x11, $r6 + $x5 = VSHUFFLE $x5, $x5, $r6 + $x0 = VSHUFFLE $x0, $x0, $r6 + $x7 = VSHUFFLE $x7, $x7, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense 
$bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x2, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir new file mode 100644 index 000000000000..faa3459fca85 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: 
// %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x8 = VSHUFFLE $x8, $x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x10 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def 
$srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index 789ef09bdcaa..2cee43297f55 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -34,35 +34,43 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; nopb ; nopx ; add.nc lc, r0, #-1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopxm + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 + ; CHECK-NEXT: nop + ; CHECK-NEXT: add.nc lc, r0, #-4 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopb ; vsrs.s8.s32 wh0, cm0, s1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1 - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 - ; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1 - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1 ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; vst.srs.s8.s32 cm3, s0, [p1], #32; nopxm ; nopv + ; 
CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopx - ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1 - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopb ; nopx + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 ; CHECK-NEXT: nop - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 ; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1 ; CHECK-NEXT: nop ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir b/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir index ff6d666fbd80..258452803954 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir @@ -4,8 +4,9 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -# RUN: llc -mtriple=aie2 --run-pass=postmisched --issue-limit=1 -debug-only=machine-scheduler %s -o - 2>%t.log -# RUN: cat %t.log | FileCheck %s --check-prefix=CHECK-WAW +# RUN: llc -mtriple=aie2 --run-pass=postmisched --issue-limit=1 \ +# RUN: -debug-only=machine-scheduler --aie-pipeliner-waw-sticky-registers=0 \ +# RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-WAW # REQUIRES: asserts # This test checks the write-after-write(WAW) dependencies