diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 5cc062a2db73..9875df3be2d3 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -19,7 +19,8 @@ #include #define DEBUG_TYPE "postpipeliner" -#define LLVM_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X) +#define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X) +#define DEBUG_FULL(X) DEBUG_WITH_TYPE("postpipeliner-full", X) namespace llvm::AIE { @@ -209,6 +210,7 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { if (NewEarliest > Info[SNum].Earliest) { Info[SNum].LastEarliestPusher = SU.NodeNum; Info[SNum].Earliest = NewEarliest; + Info[SU.NodeNum].NumPushedEarliest++; LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << " -; "); } } @@ -223,6 +225,7 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { if (NewLatest < Info[PNum].Latest) { Info[PNum].LastLatestPusher = SU.NodeNum; Info[PNum].Latest = NewLatest; + Info[SU.NodeNum].NumPushedLatest++; LLVM_DEBUG(dbgs() << PNum << " to - " << Info[PNum].Latest << "; "); } } @@ -463,12 +466,16 @@ int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) { return Best; } -void PostPipeliner::resetSchedule() { +void PostPipeliner::resetSchedule(bool ResetCritical) { Scoreboard.clear(); for (int K = 0; K < NTotalInstrs; K++) { auto &N = Info[K]; - N.LastEarliestPusher = -1; - N.LastLatestPusher = -1; + if (ResetCritical) { + N.NumPushedEarliest = 0; + N.NumPushedLatest = 0; + N.LastEarliestPusher = -1; + N.LastLatestPusher = -1; + } if (K < NInstr) { N.Earliest = N.StaticEarliest; N.Latest = N.StaticLatest; @@ -518,7 +525,7 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { scheduleNode(SU, Actual); Info[N].Scheduled = true; - LLVM_DEBUG(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull();); + DEBUG_FULL(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull();); } LLVM_DEBUG(dbgs() << "==== First iteration 
scheduled by " << Strategy.name() << "====\n"); @@ -557,78 +564,83 @@ PostPipelinerStrategy::~PostPipelinerStrategy() {} class ConfigStrategy : public PostPipelinerStrategy { public: - enum class Modulation { OneWay, Alternate, Split, RandomSwing }; + enum class Modulation { OneWay, Alternate }; private: Modulation Mode = Modulation::OneWay; bool FromTop = true; - uint64_t Param = 0; - uint64_t RandomState; - int Count = 0; - std::set SuccSiblingScheduled; - - // Most trivial seeds lead to boring initial sequences, - // so spice them up a bit. The multiplication spreads the bits across the word - // and the addition makes 0 interesting. We may hit the one fixpoint value, - // but chances are really slim. - uint64_t spice(uint64_t Value) { - return Value * 0x1555555555555555ull + 0x123456789abcdefull; - } - // Maximum length 63 bits LFSR - int randomBit() { - const uint64_t BitIn = (RandomState >> 62) ^ (RandomState >> 61); - RandomState = (RandomState << 1) | (BitIn & 1); - return RandomState & 1; - } + std::function Discriminators[8] = { + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return IA.NumPushedEarliest > IB.NumPushedEarliest; + }, + [&](const SUnit &A, const SUnit &B) { + return SuccSiblingScheduled.count(A.NodeNum) > + SuccSiblingScheduled.count(B.NodeNum); + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return IA.LCDLatest < IB.LCDLatest; + }, + [&](const SUnit &A, const SUnit &B) { return A.NodeNum < B.NodeNum; }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return IA.NumPushedLatest > IB.NumPushedLatest; + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return IA.Earliest > IB.Earliest; + }, + [&](const SUnit &A, const SUnit &B) { return false; }, + [&](const SUnit &A, const SUnit &B) { return A.NodeNum > B.NodeNum; }, + }; + 
int Priority[3] = {0, 1, 2}; bool better(const SUnit &A, const SUnit &B) override { - auto &IA = Info[A.NodeNum]; - auto &IB = Info[B.NodeNum]; - if (FromTop && SuccSiblingScheduled.count(A.NodeNum) > - SuccSiblingScheduled.count(B.NodeNum)) { - return true; - } - if (FromTop) { - return IA.LCDLatest < IB.LCDLatest; + int PBias = FromTop ? 0 : 3; + for (auto P : Priority) { + if (Discriminators[P + PBias](A, B)) { + return true; + } } - - return IA.Earliest > IB.Earliest; + return false; } bool fromTop() override { return FromTop; } + void selected(const SUnit &N) override { - switch (Mode) { - case Modulation::Alternate: - FromTop = !FromTop; - break; - case Modulation::Split: - if (Count) { - Count--; - if (!Count) { - FromTop = !FromTop; - } + if (FromTop) { + // Promote the critical path + NodeInfo *Pushed = &Info[N.NodeNum]; + while (Pushed->LastEarliestPusher >= 0) { + Pushed = &Info[Pushed->LastEarliestPusher]; + Pushed->NumPushedEarliest++; } - break; - case Modulation::RandomSwing: - FromTop = randomBit(); - break; - case Modulation::OneWay: + // Promote my siblings - if (FromTop) { - for (auto &SDep : N.Succs) { - if (SDep.getKind() != SDep::Data) { + for (auto &SDep : N.Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + for (auto &PDep : SDep.getSUnit()->Preds) { + if (PDep.getKind() != SDep::Data) { continue; } - for (auto &PDep : SDep.getSUnit()->Preds) { - if (PDep.getKind() != SDep::Data) { - continue; - } - SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum); - } + SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum); } } - break; + } + // else { + // TODO : fill in !FromTop + // } + + if (Mode == Modulation::Alternate) { + FromTop = !FromTop; } } @@ -641,10 +653,10 @@ class ConfigStrategy : public PostPipelinerStrategy { ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, int Length, enum Modulation Mode, bool FromTop, uint64_t Param) : PostPipelinerStrategy(DAG, Info, Length), Mode(Mode), FromTop(FromTop), - Param(Param), 
RandomState(spice(Param)) { - if (Mode == Modulation::Split) { - Count = Length * Param / 1000; - } + Param(Param) { + Priority[0] = (Param >> 0) & 0x3; + Priority[1] = (Param >> 2) & 0x3; + Priority[2] = (Param >> 4) & 0x3; } }; @@ -665,21 +677,25 @@ bool PostPipeliner::tryHeuristics() { SearchVolume *= Node.Latest + MinLength - Node.Earliest + 1; } - LLVM_SUMMARY(dbgs() << "-- MinLength=" << MinLength - << " SearchVolume=" << SearchVolume << "\n"); + DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength + << " SearchVolume=" << SearchVolume << "\n"); - constexpr auto OneWay = ConfigStrategy::Modulation::OneWay; - constexpr auto Alternate = ConfigStrategy::Modulation::Alternate; - constexpr auto Split = ConfigStrategy::Modulation::Split; + // Critical, sibling, min latest + static const int P012 = 0x24; + // Original order + static const int POrg = 0x3f; static const std::tuple Configs[] = { - {0, OneWay, true, 0}, {0, OneWay, false, 0}, - {1, OneWay, true, 0}, {1, OneWay, false, 0}, - {0, Split, true, 300}, {0, Split, true, 600}, - {0, Split, false, 300}, {0, Split, false, 600}, - {1, Alternate, false, 0}, {1, OneWay, true, 0}, - {1, OneWay, false, 0}, {1, Alternate, false, 0}, - + // Loosely speaking, a lower value of the first parameter targets + // a lower stage count, which benefits code size. 
+ {0, ConfigStrategy::Modulation::OneWay, true, P012}, + {0, ConfigStrategy::Modulation::OneWay, false, P012}, + {0, ConfigStrategy::Modulation::OneWay, true, POrg}, + {0, ConfigStrategy::Modulation::OneWay, false, POrg}, + {1, ConfigStrategy::Modulation::OneWay, true, P012}, + {1, ConfigStrategy::Modulation::OneWay, false, P012}, + {1, ConfigStrategy::Modulation::OneWay, true, POrg}, + {1, ConfigStrategy::Modulation::OneWay, false, POrg}, }; int H = 0; for (auto &[Extra, Mode, FromTop, Param] : Configs) { @@ -688,16 +704,25 @@ bool PostPipeliner::tryHeuristics() { continue; } ConfigStrategy S(*DAG, Info, MinLength + Extra * II, Mode, FromTop, Param); - resetSchedule(); - LLVM_SUMMARY(dbgs() << "--- Strategy " << S.name()); + resetSchedule(/*ResetCritical=*/true); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name()); + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n"); + return true; + } + + DEBUG_SUMMARY(dbgs() << " failed\n"); + resetSchedule(/*ResetCritical=*/false); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() + << " with critical path"); if (scheduleFirstIteration(S) && scheduleOtherIterations()) { - LLVM_SUMMARY(dbgs() << " found II=" << II << "\n"); + DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n"); return true; } - LLVM_SUMMARY(dbgs() << " failed\n"); + DEBUG_SUMMARY(dbgs() << " failed\n"); H++; } - LLVM_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); + DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); return false; } diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 579f709dcffc..64318abe2620 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -93,8 +93,12 @@ class NodeInfo { int Latest = -1; // Record critical path components + // The Pred/Succ that pushed my Earliest/Latest int LastEarliestPusher = -1; int LastLatestPusher = -1; + // The number of Succs/Preds 
whose Earliest/Latest I have pushed. + int NumPushedEarliest = 0; + int NumPushedLatest = 0; // Latest corrected taking Earliest of an LCD successor into account int LCDLatest = -1; @@ -196,8 +200,9 @@ class PostPipeliner { void computeForward(); bool computeBackward(); - /// Forget the previous round of scheduling - void resetSchedule(); + /// Forget the previous round of scheduling. Also forget the critical path + /// if ResetCritical is set. + void resetSchedule(bool ResetCritical); /// Try all heuristics, stop at the first that fits the II /// If it returns true, a valid schedule is laid down in Info. diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir index 305655f47d9a..2cb5d356493f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir @@ -35,52 +35,50 @@ ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv ; CHECK-NEXT: vldb wl10, [p0], #32; nopx ; CHECK-NEXT: vldb wh10, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh7, [p1], m6 ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, 
x11, r3 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x0, x5, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 ; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3 ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh7, [p1], m6 ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, 
bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x0, x5, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 @@ -89,7 +87,6 @@ ; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir index 2e943946aec3..90cf2d235f1c 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir @@ -27,60 +27,64 @@ ; CHECK-NEXT: add.nc lc, r0, #-1 ; CHECK-NEXT: movxm ls, #.LBB0_2 ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6; 
nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv - ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv - ; CHECK-NEXT: vldb wl10, [p0], #32; nopx - ; CHECK-NEXT: vldb wh10, [p0], #32 - ; CHECK-NEXT: vldb wl5, [p1], m5 - ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 - ; CHECK-NEXT: vldb wh7, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x0, x8, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x0, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x0, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + 
; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x8, x10, r16 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: nopa ; vldb wl9, [p1], m5; nopxm ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 - ; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x0, x7, r3 - ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 - ; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 - ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 - ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x0, x3, r3 - ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 - ; CHECK-NEXT: vldb wh7, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x0, x8, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x0, x11, 
r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x0, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x0, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x8, x10, r16 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3 ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 @@ -88,13 +92,6 @@ ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 ; CHECK-NEXT: 
vmac.f bml5, bml5, x0, x3, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 - ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: nopx - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir index af9e1589e96a..8dec880a39b1 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir @@ -27,60 +27,64 @@ ; CHECK-NEXT: add.nc lc, r0, #-1 ; CHECK-NEXT: movxm ls, #.LBB0_2 ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv - ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv - ; CHECK-NEXT: vldb wl10, [p0], #32; nopx - ; CHECK-NEXT: vldb wh10, [p0], #32 - ; CHECK-NEXT: vldb wl5, [p1], m5 - ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 ; CHECK-NEXT: vldb wl3, [p0], 
#32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 - ; CHECK-NEXT: vldb wh7, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x2, x2, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x2, x2, x10, r16 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: nopa ; vldb wl9, [p1], m5; nopxm ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 - ; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x2, x7, r3 - ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, 
x7, r3 - ; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 - ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 - ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x2, x3, r3 - ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 - ; CHECK-NEXT: vldb wh7, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x2, x2, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x2, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x2, x2, x10, r16 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh1, bmh1, x2, x11, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; 
vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3 ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 @@ -88,13 +92,6 @@ ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 ; CHECK-NEXT: vmac.f bml5, bml5, x2, x3, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 - ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: nopx - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir index fbd31a30aea4..72c73af64e31 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir @@ -35,52 +35,50 @@ ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv ; CHECK-NEXT: vldb wl10, [p0], #32; nopx ; CHECK-NEXT: vldb wh10, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh7, [p1], m6 ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, 
x8, x10, r4 ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x10, x5, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 ; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3 ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3 ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3 ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: 
vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 - ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb wl7, [p1], m5 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh7, [p1], m6 ; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4 ; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16 - ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3 - ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vshuffle x6, x6, x3, r16 - ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x10, x5, r3 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 @@ -89,7 +87,6 @@ ; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop