From 9f3e514997573b39d035be5877e1485fbc2a6edb Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Wed, 6 Nov 2024 16:57:37 +0100 Subject: [PATCH] [AIE] more heuristics and an option to select a specific one rework interfaces to take SUnits as parameters; supply DAG and Info in constructors --- llvm/lib/Target/AIE/AIEPostPipeliner.cpp | 120 ++++++++++++++---- llvm/lib/Target/AIE/AIEPostPipeliner.h | 20 ++- .../postpipeliner/gemm-nopresched.mir | 22 ++-- 3 files changed, 117 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index b17520761997..ef7bc13af35b 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -23,6 +23,11 @@ namespace llvm::AIE { +static cl::opt + Heuristic("aie-postpipeliner-heuristic", + cl::desc("Select one specific post-pipeliner heuristic"), + cl::init(-1), cl::Hidden); + PipelineScheduleVisitor::~PipelineScheduleVisitor() {} class PostPipelineDumper : public PipelineScheduleVisitor { @@ -301,7 +306,7 @@ bool PostPipeliner::computeBackward() { return Changed; } -void PostPipeliner::computeLoopCarriedParameters() { +bool PostPipeliner::computeLoopCarriedParameters() { // Forward properties like Earliest and Ancestors. computeForward(); @@ -336,7 +341,7 @@ void PostPipeliner::computeLoopCarriedParameters() { } // Loop carried dependences will have pushed away Earliest of the second - // iteration, which should stay in lock step with the first + // iteration, which should stay in lock step with the first. 
for (int K = 0; K < NInstr; K++) { const int K2 = K + NInstr; const int Earliest = Info[K2].Earliest - II; @@ -368,6 +373,7 @@ void PostPipeliner::computeLoopCarriedParameters() { N.StaticEarliest = N.Earliest; N.StaticLatest = N.Latest; } + return true; } void dumpGraph(int NInstr, const std::vector &Info, @@ -392,11 +398,14 @@ void dumpGraph(int NInstr, const std::vector &Info, if (S >= NInstr) { dbgs() << "_" << S % NInstr; } - - dbgs() << " # L=" << Dep.getSignedLatency(); if (Dep.getKind() == SDep::Output) { - dbgs() << " WAW"; + dbgs() << " [color=blue]"; + } else if (Dep.getKind() == SDep::Anti) { + dbgs() << " [color=black]"; + } else { + dbgs() << " [color=red]"; } + dbgs() << " # L=" << Dep.getSignedLatency(); dbgs() << "\n"; } } @@ -432,7 +441,7 @@ int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) { continue; } LLVM_DEBUG(dbgs() << " SU" << K); - if (Best == -1 || Strategy.better(Info[K], Info[Best])) { + if (Best == -1 || Strategy.better(SU, DAG->SUnits[Best])) { Best = K; LLVM_DEBUG(dbgs() << "*"); } @@ -468,8 +477,8 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { LLVM_DEBUG(dbgs() << " Trying " << N << "\n"); SUnit &SU = DAG->SUnits[N]; MachineInstr *MI = SU.getInstr(); - const int Earliest = Strategy.earliest(Info[N]); - const int Latest = Strategy.latest(Info[N]); + const int Earliest = Strategy.earliest(SU); + const int Latest = Strategy.latest(SU); // Find the first cycle that fits. We try every position modulo II const int Actual = Strategy.fromTop() ? 
fit(MI, Earliest, Latest, II) : fit(MI, Latest, Earliest, II); @@ -478,7 +487,7 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { LLVM_DEBUG(dbgs() << "Out of resources\n"); return false; } - Strategy.selected(Info[N]); + Strategy.selected(SU); const int LocalCycle = Actual % II; const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI); int Cycle = -Depth + LocalCycle; @@ -533,7 +542,7 @@ PostPipelinerStrategy::~PostPipelinerStrategy() {} class ConfigStrategy : public PostPipelinerStrategy { public: - enum class Modulation { OneWay, Alternate, RandomSwing }; + enum class Modulation { OneWay, Alternate, Split, RandomSwing }; private: Modulation Mode = Modulation::OneWay; @@ -541,6 +550,9 @@ class ConfigStrategy : public PostPipelinerStrategy { uint64_t Param = 0; uint64_t RandomState; + int Count = 0; + + std::set SuccSiblingScheduled; // Most trivial seeds lead to boring initial sequences, // so spice them up a bit. The multiplication spreads the bits across the word @@ -556,23 +568,51 @@ class ConfigStrategy : public PostPipelinerStrategy { return RandomState & 1; } - bool better(const NodeInfo &A, const NodeInfo &B) override { + bool better(const SUnit &A, const SUnit &B) override { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + if (FromTop && SuccSiblingScheduled.count(A.NodeNum) > + SuccSiblingScheduled.count(B.NodeNum)) { + return true; + } if (FromTop) { - return A.LCDLatest < B.LCDLatest; + return IA.LCDLatest < IB.LCDLatest; } - return A.Earliest > B.Earliest; + return IA.Earliest > IB.Earliest; } bool fromTop() override { return FromTop; } - void selected(NodeInfo &N) override { + void selected(const SUnit &N) override { switch (Mode) { case Modulation::Alternate: FromTop = !FromTop; break; + case Modulation::Split: + if (Count) { + Count--; + if (!Count) { + FromTop = !FromTop; + } + } + break; case Modulation::RandomSwing: FromTop = randomBit(); break; case Modulation::OneWay: + // Promote my siblings + if 
(FromTop) { + for (auto &SDep : N.Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + for (auto &PDep : SDep.getSUnit()->Preds) { + if (PDep.getKind() != SDep::Data) { + continue; + } + SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum); + } + } + } break; } } @@ -583,31 +623,56 @@ class ConfigStrategy : public PostPipelinerStrategy { std::to_string(int(Mode)) + "_" + std::to_string(FromTop) + "_" + std::to_string(Param); } - ConfigStrategy(int Length, enum Modulation Mode, bool FromTop, uint64_t Param) - : PostPipelinerStrategy(Length), Mode(Mode), FromTop(FromTop), - RandomState(spice(Param)) {} + ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, + int Length, enum Modulation Mode, bool FromTop, uint64_t Param) + : PostPipelinerStrategy(DAG, Info, Length), Mode(Mode), FromTop(FromTop), + Param(Param), RandomState(spice(Param)) { + if (Mode == Modulation::Split) { + Count = Length * Param / 1000; + } + } }; bool PostPipeliner::tryHeuristics() { + // The minimum length makes sure that every node has a range in which it + // can be scheduled int MinLength = II; - for (auto &Node : Info) { - while (Node.Latest + MinLength < Node.Earliest) { + for (int K = 0; K < NInstr; K++) { + auto &Node = Info[K]; + while (Node.Earliest > Node.Latest + MinLength) { MinLength += II; } } + int64_t SearchVolume = 1; + const int64_t Limit = int64_t(1) << 50; // 2^50, ~1.1e15 (quadrillion) + for (int K = 0; K < NInstr && SearchVolume < Limit; K++) { + auto &Node = Info[K]; + SearchVolume *= Node.Latest + MinLength - Node.Earliest + 1; + } + + LLVM_SUMMARY(dbgs() << "-- MinLength=" << MinLength + << " SearchVolume=" << SearchVolume << "\n"); constexpr auto OneWay = ConfigStrategy::Modulation::OneWay; constexpr auto Alternate = ConfigStrategy::Modulation::Alternate; - constexpr auto RandomSwing = ConfigStrategy::Modulation::RandomSwing; - static const std::tuple + constexpr auto Split = ConfigStrategy::Modulation::Split; + static const std::tuple Configs[] = { - {false, OneWay, 
true, 0}, - {false, OneWay, false, 0}, - {false, Alternate, false, 0}, - {false, RandomSwing, false, 1}, + {0, OneWay, true, 0}, {0, OneWay, false, 0}, + {1, OneWay, true, 0}, {1, OneWay, false, 0}, + {0, Split, true, 300}, {0, Split, true, 600}, + {0, Split, false, 300}, {0, Split, false, 600}, + {1, Alternate, false, 0}, {1, OneWay, true, 0}, + {1, OneWay, false, 0}, {1, Alternate, false, 0}, + }; - for (auto &[UseMin, Mode, FromTop, Param] : Configs) { - ConfigStrategy S(UseMin ? MinLength : NCopies * II, Mode, FromTop, Param); + int H = 0; + for (auto &[Extra, Mode, FromTop, Param] : Configs) { + if (Heuristic >= 0 && Heuristic != H) { + H++; + continue; + } + ConfigStrategy S(*DAG, Info, MinLength + Extra * II, Mode, FromTop, Param); resetSchedule(); LLVM_SUMMARY(dbgs() << "--- Strategy " << S.name()); if (scheduleFirstIteration(S) && scheduleOtherIterations()) { @@ -615,6 +680,7 @@ bool PostPipeliner::tryHeuristics() { return true; } LLVM_SUMMARY(dbgs() << " failed\n"); + H++; } LLVM_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); return false; diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index 2c982a43d184..3dfaf43cacc9 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -104,21 +104,27 @@ class NodeInfo { class PostPipelinerStrategy { protected: + ScheduleDAGInstrs &DAG; + std::vector &Info; int LatestBias = 0; public: - PostPipelinerStrategy(int LatestBias) : LatestBias(LatestBias){}; + PostPipelinerStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, + int LatestBias) + : DAG(DAG), Info(Info), LatestBias(LatestBias){}; virtual ~PostPipelinerStrategy(); virtual std::string name() { return "PostPipelinerStrategy"; } // Choose among available alternatives - virtual bool better(const NodeInfo &A, const NodeInfo &B) { return false; } + virtual bool better(const SUnit &A, const SUnit &B) { return false; } // Tweak the effective earliest - virtual int 
earliest(const NodeInfo &N) { return N.Earliest; } + virtual int earliest(const SUnit &N) { return Info[N.NodeNum].Earliest; } // Select from top or from bottom. - virtual int latest(const NodeInfo &N) { return N.Latest + LatestBias; } + virtual int latest(const SUnit &N) { + return Info[N.NodeNum].Latest + LatestBias; + } // Report a final selection. This marks the start of selecting a new node. // fromTop() should be invariant between calls to selected() - virtual void selected(NodeInfo &N){}; + virtual void selected(const SUnit &N){}; virtual bool fromTop() { return true; } }; @@ -179,8 +185,8 @@ class PostPipeliner { int fit(MachineInstr *MI, int Earliest, int NTries, int II); /// Provide some look ahead by seeing the effect of the first iteration - /// on the second iteration. - void computeLoopCarriedParameters(); + /// on the second iteration. May return false if the II isn't feasible. + bool computeLoopCarriedParameters(); /// Helpers of computeLoopCarriedParameters() void computeForward(); diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir index cde7765f7766..bdda8a24ad3b 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir @@ -48,13 +48,13 @@ ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 - ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32; nopx ; vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 - ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 - ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 - ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, 
r2 - ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2 - ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32; nopx ; vshuffle x0, x0, x0, r6; vmac.f bmh6, bmh6, x10, x5, r2 + ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x3, x11, x11, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh1, bmh1, x2, x3, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r2 ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r2 ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6; vmac.f bml2, bml2, x9, x0, r2 @@ -66,13 +66,13 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup ; CHECK-NEXT: nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 - ; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 - ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 - ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh6, bmh6, x10, x5, r2 + ; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x6, x3, r2 ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r2 ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r2 ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r2 - ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 ; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r2 ; CHECK-NEXT: vmac.f 
bml1, bml1, x10, x0, r2 ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2