From 5024c325fadb1238552bf01d717b412f84ac6bb2 Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Fri, 6 Dec 2024 16:02:19 +0100 Subject: [PATCH] [AIE] Implement bottom up scheduling with symmetric priority components --- llvm/lib/Target/AIE/AIEPostPipeliner.cpp | 56 ++++++++++++++----- .../AIE/aie2/schedule/postpipeliner/round.mir | 45 +++++++-------- 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 05c256558ca3..fb8a3b2434ef 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -522,6 +522,8 @@ class DefaultStrategy : public PostPipelinerStrategy { }; class ConfigStrategy : public PostPipelinerStrategy { + bool TopDown = true; + public: enum PriorityComponent { NodeNum, @@ -552,22 +554,27 @@ class ConfigStrategy : public PostPipelinerStrategy { private: std::string Name; std::set SuccSiblingScheduled; + std::set PredSiblingScheduled; std::function Discriminators[PriorityComponent::Size] = { - [&](const SUnit &A, const SUnit &B) { return A.NodeNum < B.NodeNum; }, + [&](const SUnit &A, const SUnit &B) { + return TopDown ? A.NodeNum < B.NodeNum : A.NodeNum > B.NodeNum; + }, [&](const SUnit &A, const SUnit &B) { auto &IA = Info[A.NodeNum]; auto &IB = Info[B.NodeNum]; - return IA.Latest < IB.Latest; + return TopDown ? IA.Latest < IB.Latest : IA.Earliest > IB.Earliest; }, [&](const SUnit &A, const SUnit &B) { auto &IA = Info[A.NodeNum]; auto &IB = Info[B.NodeNum]; - return IA.NumPushedEarliest > IB.NumPushedEarliest; + return TopDown ? IA.NumPushedEarliest > IB.NumPushedEarliest + : IA.NumPushedLatest > IB.NumPushedLatest; }, [&](const SUnit &A, const SUnit &B) { - return SuccSiblingScheduled.count(A.NodeNum) > - SuccSiblingScheduled.count(B.NodeNum); + std::set &Sibling = + TopDown ? 
SuccSiblingScheduled : PredSiblingScheduled; + return Sibling.count(A.NodeNum) > Sibling.count(B.NodeNum); }, [&](const SUnit &A, const SUnit &B) { auto &IA = Info[A.NodeNum]; @@ -577,6 +584,8 @@ class ConfigStrategy : public PostPipelinerStrategy { }; std::vector Priority; + bool fromTop() override { return TopDown; } + bool better(const SUnit &A, const SUnit &B) override { for (auto P : Priority) { if (Discriminators[P](A, B)) { @@ -606,14 +615,26 @@ class ConfigStrategy : public PostPipelinerStrategy { SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum); } } + for (auto &PDep : N.Preds) { + if (PDep.getKind() != SDep::Data) { + continue; + } + for (auto &SDep : PDep.getSUnit()->Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + PredSiblingScheduled.insert(PDep.getSUnit()->NodeNum); + } + } } public: std::string name() override { return Name; } ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector &Info, - int Length, ArrayRef Components) - : PostPipelinerStrategy(DAG, Info, Length) { - Name = "Config_" + std::to_string(Length); + int Length, bool TopDown, + ArrayRef Components) + : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown) { + Name = "Config_" + std::to_string(Length) + std::to_string(TopDown); for (auto Comp : Components) { Name += "_" + getPriorityName(Comp); Priority.emplace_back(Comp); @@ -623,15 +644,21 @@ class ConfigStrategy : public PostPipelinerStrategy { static const struct { int ExtraStages; + bool TopDown; bool Rerun; ConfigStrategy::PriorityComponent Components[3]; } Strategies[] = { // Loosely speaking, a lower value of the first parameter targets // a lower stage count, which benefits code size. - {1, false, {ConfigStrategy::NodeNum}}, - {1, false, {ConfigStrategy::Latest}}, - {1, true, {ConfigStrategy::Critical}}, - {1, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + // Rerun is only useful for heuristics that use it, e.g. 
Critical + {1, true, false, {ConfigStrategy::NodeNum}}, + {1, true, false, {ConfigStrategy::Latest}}, + {1, true, true, {ConfigStrategy::Critical}}, + {1, true, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + {0, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + {1, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + // This is pure bottom up + {1, false, false, {ConfigStrategy::NodeNum}}, }; bool PostPipeliner::tryHeuristics() { @@ -640,11 +667,12 @@ bool PostPipeliner::tryHeuristics() { DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); int HeuristicIndex = 0; - for (auto &[ExtraStages, Rerun, Components] : Strategies) { + for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) { if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { continue; } - ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, Components); + ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, TopDown, + Components); resetSchedule(/*FullReset=*/true); DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name()); if (scheduleFirstIteration(S) && scheduleOtherIterations()) { diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index 18a91da1ebe0..2cee43297f55 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -34,46 +34,47 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopxm ; CHECK-NEXT: nop ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 ; CHECK-NEXT: nop - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; add.nc lc, r0, 
#-3 + ; CHECK-NEXT: add.nc lc, r0, #-4 ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vups.s32.s8 cm2, wh0, s1; nopv - ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; nopb ; vst.srs.s8.s32 cm2, s0, [p1], #32 - ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; 
vsrs.s8.s32 wh2, cm1, s1; nopx + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopb ; nopx ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 - ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 - ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; vups.s32.s8 cm3, wh2, s1 - ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1 ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 - ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 ; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1 ; CHECK-NEXT: nop ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32 - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr