Skip to content

Commit

Permalink
[AIE] Re-pipeline with knowledge of a failing critical path
Browse files Browse the repository at this point in the history
some more logging
  • Loading branch information
Martien de Jong committed Nov 18, 2024
1 parent a0d79fd commit 38a992a
Show file tree
Hide file tree
Showing 6 changed files with 260 additions and 242 deletions.
185 changes: 105 additions & 80 deletions llvm/lib/Target/AIE/AIEPostPipeliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#include <unordered_set>

#define DEBUG_TYPE "postpipeliner"
#define LLVM_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X)
#define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X)
#define DEBUG_FULL(X) DEBUG_WITH_TYPE("postpipeliner-full", X)

namespace llvm::AIE {

Expand Down Expand Up @@ -209,6 +210,7 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) {
if (NewEarliest > Info[SNum].Earliest) {
Info[SNum].LastEarliestPusher = SU.NodeNum;
Info[SNum].Earliest = NewEarliest;
Info[SU.NodeNum].NumPushedEarliest++;
LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << " -; ");
}
}
Expand All @@ -223,6 +225,7 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) {
if (NewLatest < Info[PNum].Latest) {
Info[PNum].LastLatestPusher = SU.NodeNum;
Info[PNum].Latest = NewLatest;
Info[SU.NodeNum].NumPushedLatest++;
LLVM_DEBUG(dbgs() << PNum << " to - " << Info[PNum].Latest << "; ");
}
}
Expand Down Expand Up @@ -463,12 +466,16 @@ int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) {
return Best;
}

void PostPipeliner::resetSchedule() {
void PostPipeliner::resetSchedule(bool ResetCritical) {
Scoreboard.clear();
for (int K = 0; K < NTotalInstrs; K++) {
auto &N = Info[K];
N.LastEarliestPusher = -1;
N.LastLatestPusher = -1;
if (ResetCritical) {
N.NumPushedEarliest = 0;
N.NumPushedLatest = 0;
N.LastEarliestPusher = -1;
N.LastLatestPusher = -1;
}
if (K < NInstr) {
N.Earliest = N.StaticEarliest;
N.Latest = N.StaticLatest;
Expand Down Expand Up @@ -518,7 +525,7 @@ bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) {

scheduleNode(SU, Actual);
Info[N].Scheduled = true;
LLVM_DEBUG(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull(););
DEBUG_FULL(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull(););
}
LLVM_DEBUG(dbgs() << "==== First iteration scheduled by " << Strategy.name()
<< "====\n");
Expand Down Expand Up @@ -557,78 +564,83 @@ PostPipelinerStrategy::~PostPipelinerStrategy() {}

class ConfigStrategy : public PostPipelinerStrategy {
public:
enum class Modulation { OneWay, Alternate, Split, RandomSwing };
enum class Modulation { OneWay, Alternate };

private:
Modulation Mode = Modulation::OneWay;
bool FromTop = true;

uint64_t Param = 0;
uint64_t RandomState;
int Count = 0;

std::set<int> SuccSiblingScheduled;

// Most trivial seeds lead to boring initial sequences,
// so spice them up a bit. The multiplication spreads the bits across the word
// and the addition makes 0 interesting. We may hit the one fixpoint value,
// but chances are really slim.
uint64_t spice(uint64_t Value) {
return Value * 0x1555555555555555ull + 0x123456789abcdefull;
}
// Maximum length 63 bits LFSR
int randomBit() {
const uint64_t BitIn = (RandomState >> 62) ^ (RandomState >> 61);
RandomState = (RandomState << 1) | (BitIn & 1);
return RandomState & 1;
}
std::function<bool(const SUnit &A, const SUnit &B)> Discriminators[8] = {
[&](const SUnit &A, const SUnit &B) {
auto &IA = Info[A.NodeNum];
auto &IB = Info[B.NodeNum];
return IA.NumPushedEarliest > IB.NumPushedEarliest;
},
[&](const SUnit &A, const SUnit &B) {
return SuccSiblingScheduled.count(A.NodeNum) >
SuccSiblingScheduled.count(B.NodeNum);
},
[&](const SUnit &A, const SUnit &B) {
auto &IA = Info[A.NodeNum];
auto &IB = Info[B.NodeNum];
return IA.LCDLatest < IB.LCDLatest;
},
[&](const SUnit &A, const SUnit &B) { return A.NodeNum < B.NodeNum; },
[&](const SUnit &A, const SUnit &B) {
auto &IA = Info[A.NodeNum];
auto &IB = Info[B.NodeNum];
return IA.NumPushedLatest > IB.NumPushedLatest;
},
[&](const SUnit &A, const SUnit &B) {
auto &IA = Info[A.NodeNum];
auto &IB = Info[B.NodeNum];
return IA.Earliest > IB.Earliest;
},
[&](const SUnit &A, const SUnit &B) { return false; },
[&](const SUnit &A, const SUnit &B) { return A.NodeNum > B.NodeNum; },
};
int Priority[3] = {0, 1, 2};

bool better(const SUnit &A, const SUnit &B) override {
auto &IA = Info[A.NodeNum];
auto &IB = Info[B.NodeNum];
if (FromTop && SuccSiblingScheduled.count(A.NodeNum) >
SuccSiblingScheduled.count(B.NodeNum)) {
return true;
}
if (FromTop) {
return IA.LCDLatest < IB.LCDLatest;
int PBias = FromTop ? 0 : 3;
for (auto P : Priority) {
if (Discriminators[P + PBias](A, B)) {
return true;
}
}

return IA.Earliest > IB.Earliest;
return false;
}
bool fromTop() override { return FromTop; }

void selected(const SUnit &N) override {
switch (Mode) {
case Modulation::Alternate:
FromTop = !FromTop;
break;
case Modulation::Split:
if (Count) {
Count--;
if (!Count) {
FromTop = !FromTop;
}
if (FromTop) {
// Promote the critical path
NodeInfo *Pushed = &Info[N.NodeNum];
while (Pushed->LastEarliestPusher >= 0) {
Pushed = &Info[Pushed->LastEarliestPusher];
Pushed->NumPushedEarliest++;
}
break;
case Modulation::RandomSwing:
FromTop = randomBit();
break;
case Modulation::OneWay:

// Promote my siblings
if (FromTop) {
for (auto &SDep : N.Succs) {
if (SDep.getKind() != SDep::Data) {
for (auto &SDep : N.Succs) {
if (SDep.getKind() != SDep::Data) {
continue;
}
for (auto &PDep : SDep.getSUnit()->Preds) {
if (PDep.getKind() != SDep::Data) {
continue;
}
for (auto &PDep : SDep.getSUnit()->Preds) {
if (PDep.getKind() != SDep::Data) {
continue;
}
SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum);
}
SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum);
}
}
break;
}
// else {
// TODO : fill in !FromTop
// }

if (Mode == Modulation::Alternate) {
FromTop = !FromTop;
}
}

Expand All @@ -641,10 +653,10 @@ class ConfigStrategy : public PostPipelinerStrategy {
ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info,
int Length, enum Modulation Mode, bool FromTop, uint64_t Param)
: PostPipelinerStrategy(DAG, Info, Length), Mode(Mode), FromTop(FromTop),
Param(Param), RandomState(spice(Param)) {
if (Mode == Modulation::Split) {
Count = Length * Param / 1000;
}
Param(Param) {
Priority[0] = (Param >> 0) & 0x3;
Priority[1] = (Param >> 2) & 0x3;
Priority[2] = (Param >> 4) & 0x3;
}
};

Expand All @@ -665,21 +677,25 @@ bool PostPipeliner::tryHeuristics() {
SearchVolume *= Node.Latest + MinLength - Node.Earliest + 1;
}

LLVM_SUMMARY(dbgs() << "-- MinLength=" << MinLength
<< " SearchVolume=" << SearchVolume << "\n");
DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength
<< " SearchVolume=" << SearchVolume << "\n");

constexpr auto OneWay = ConfigStrategy::Modulation::OneWay;
constexpr auto Alternate = ConfigStrategy::Modulation::Alternate;
constexpr auto Split = ConfigStrategy::Modulation::Split;
// Critical, sibling, min latest
static const int P012 = 0x24;
// Original order
static const int POrg = 0x3f;
static const std::tuple<int, ConfigStrategy::Modulation, bool, uint64_t>
Configs[] = {
{0, OneWay, true, 0}, {0, OneWay, false, 0},
{1, OneWay, true, 0}, {1, OneWay, false, 0},
{0, Split, true, 300}, {0, Split, true, 600},
{0, Split, false, 300}, {0, Split, false, 600},
{1, Alternate, false, 0}, {1, OneWay, true, 0},
{1, OneWay, false, 0}, {1, Alternate, false, 0},

// Loosely speaking, a lower value of the first parameter targets
// a lower stage count, which benefits code size.
{0, ConfigStrategy::Modulation::OneWay, true, P012},
{0, ConfigStrategy::Modulation::OneWay, false, P012},
{0, ConfigStrategy::Modulation::OneWay, true, POrg},
{0, ConfigStrategy::Modulation::OneWay, false, POrg},
{1, ConfigStrategy::Modulation::OneWay, true, P012},
{1, ConfigStrategy::Modulation::OneWay, false, P012},
{1, ConfigStrategy::Modulation::OneWay, true, POrg},
{1, ConfigStrategy::Modulation::OneWay, false, POrg},
};
int H = 0;
for (auto &[Extra, Mode, FromTop, Param] : Configs) {
Expand All @@ -688,16 +704,25 @@ bool PostPipeliner::tryHeuristics() {
continue;
}
ConfigStrategy S(*DAG, Info, MinLength + Extra * II, Mode, FromTop, Param);
resetSchedule();
LLVM_SUMMARY(dbgs() << "--- Strategy " << S.name());
resetSchedule(/*ResetCritical=*/true);
DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name());
if (scheduleFirstIteration(S) && scheduleOtherIterations()) {
DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n");
return true;
}

DEBUG_SUMMARY(dbgs() << " failed\n");
resetSchedule(/*ResetCritical=*/false);
DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name()
<< " with critical path");
if (scheduleFirstIteration(S) && scheduleOtherIterations()) {
LLVM_SUMMARY(dbgs() << " found II=" << II << "\n");
DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n");
return true;
}
LLVM_SUMMARY(dbgs() << " failed\n");
DEBUG_SUMMARY(dbgs() << " failed\n");
H++;
}
LLVM_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n");
DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n");
return false;
}

Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/Target/AIE/AIEPostPipeliner.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,12 @@ class NodeInfo {
int Latest = -1;

// Record critical path components
// The Pred/Succ that pushed my Earliest/Latest
int LastEarliestPusher = -1;
int LastLatestPusher = -1;
// The number of Succs/Preds whose Earliest/Latest I have pushed.
int NumPushedEarliest = 0;
int NumPushedLatest = 0;

// Latest corrected taking Earliest of an LCD successor into account
int LCDLatest = -1;
Expand Down Expand Up @@ -196,8 +200,9 @@ class PostPipeliner {
void computeForward();
bool computeBackward();

/// Forget the previous round of scheduling
void resetSchedule();
/// Forget the previous round of scheduling. Also forget the critical path
/// if ResetCritical is set.
void resetSchedule(bool ResetCritical);

/// Try all heuristics, stop at the first that fits the II
/// If it returns true, a valid schedule is laid down in Info.
Expand Down
51 changes: 24 additions & 27 deletions llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir
Original file line number Diff line number Diff line change
Expand Up @@ -35,52 +35,50 @@
; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; nopv
; CHECK-NEXT: vldb wl10, [p0], #32; nopx
; CHECK-NEXT: vldb wh10, [p0], #32
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb.3d wh3, [p0], d0
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vldb wl6, [p0, #64]; vmac.f bml0, bml0, x8, x7, r3
; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3
; CHECK-NEXT: padds [p0], m4; vldb wl9, [p1], m5; vmac.f bml2, bml2, x6, x7, r3
; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bml3, bml3, x1, x3, r3
; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml5, bml5, x8, x3, r3
; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bml6, bml6, x0, x3, r3
; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: vldb.3d wh3, [p0], d0
; CHECK-NEXT: vldb wl5, [p1], m5
; CHECK-NEXT: vldb wh5, [p1], m6
; CHECK-NEXT: vldb wl3, [p0], #32
; CHECK-NEXT: vldb wl7, [p1], m5
; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2
; CHECK-NEXT: vldb wh7, [p1], m6
; CHECK-NEXT: vldb wl3, [p1], m5; vshuffle x1, x8, x10, r4
; CHECK-NEXT: vldb.3d wh3, [p1], d1; vshuffle x8, x8, x10, r16
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3
; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3
; CHECK-NEXT: vshuffle x5, x5, x5, r2; vmac.f bmh2, bmh2, x0, x11, r3
; CHECK-NEXT: vmac.f bmh3, bmh3, x6, x11, r3
; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh4, bmh4, x1, x5, r3
; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh6, bmh6, x0, x5, r3
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3
; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3
; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3
; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3
Expand All @@ -89,7 +87,6 @@
; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3
; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3
; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3
; CHECK-NEXT: nopx
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
Expand Down
Loading

0 comments on commit 38a992a

Please sign in to comment.