From 6659147d359bb7ee6fb8941e42ca22295ad0e375 Mon Sep 17 00:00:00 2001 From: Martien de Jong Date: Thu, 17 Oct 2024 10:26:10 +0200 Subject: [PATCH] [AIE] Include 'Latest' in postpipeliner heuristic Compute Latest relative to last cycle of last Stage (using negative values) --- llvm/lib/Target/AIE/AIEPostPipeliner.cpp | 58 ++++++++++------ .../aie2/schedule/postpipeliner/conv2d.mir | 24 +++---- .../schedule/postpipeliner/conv2d_bf16-1.mir | 45 ++++++------ .../schedule/postpipeliner/conv2d_bf16.mir | 37 +++++----- .../gemm-feasibleRA-nopstinc.mir | 68 +++++++++++++------ .../postpipeliner/gemm-feasibleRA.mir | 68 +++++++++++++------ 6 files changed, 182 insertions(+), 118 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 961aacb6acca..0aa4d74a247f 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -190,29 +190,35 @@ void PostPipeliner::computeLoopCarriedParameters() { } // Propagate Earliest upstream, initialize Latest + // Unrestricted: last cycle of last stage + const int Latest = NCopies * II - 1; for (int K = 0; K < NInstr; K++) { const int K2 = K + NInstr; const int Earliest = Info[K2].Earliest - II; Info[K].Earliest = std::max(Info[K].Earliest, Earliest); - // Unrestricted: Beyond the last stage. - Info[K].Latest = NCopies * II; + Info[K].Latest = Latest; + Info[K2].Latest = Latest; } - // Propagate Latest upstream. Latest is the latest - // that is admissible for Earliest to be achievable within II - for (int K = 0; K < NInstr; K++) { - const int K2 = K + NInstr; - const int Earliest = Info[K2].Earliest; - const auto &SU = DAG->SUnits[K2]; - for (auto &Dep : SU.Preds) { - const auto *Pred = Dep.getSUnit(); - // Any predecessor in the first iteration - int K1 = Pred->NodeNum; - if (K1 < NInstr) { - const int Latest = Earliest - Dep.getSignedLatency(); - Info[K1].Latest = std::min(Info[K1].Latest, Latest); + + // Compute Latest. Use a fixpoint loop, because plain reversed + // order may not be topological for predecessors + bool Changed = true; + while (Changed) { + Changed = false; + for (int K = NInstr - 1; K >= 0; K--) { + SUnit &SU = DAG->SUnits[K]; + const int Latest = Info[K].Latest; + for (auto &Dep : SU.Preds) { + int P = Dep.getSUnit()->NodeNum; + int NewLatest = Latest - Dep.getSignedLatency(); + if (NewLatest < Info[P].Latest) { + Info[P].Latest = NewLatest; + Changed = true; + } } } } + LLVM_DEBUG(for (int K = 0; K < NInstr; K++) { dbgs() << "SU" << K << " : " << Info[K].Earliest << " - " << Info[K].Latest << "\n"; @@ -238,7 +244,12 @@ void dumpGraph(int NInstr, const std::vector &Info, if (S >= NInstr) { dbgs() << "_" << S % NInstr; } - dbgs() << "# L=" << Dep.getSignedLatency() << "\n"; + + dbgs() << " # L=" << Dep.getSignedLatency(); + if (Dep.getKind() == SDep::Output) { + dbgs() << " WAW"; + } + dbgs() << "\n"; } } dbgs() << "}\n"; @@ -251,20 +262,27 @@ int PostPipeliner::mostUrgent() { } assert(FirstUnscheduled < NInstr); + // 'Latest' accounts for the critical path of the linear schedule + auto Better = [this](int N, int Ref) { + if (Info[N].Latest < Info[Ref].Latest) { + return true; + } + + return false; + }; + int Best = -1; LLVM_DEBUG(dbgs() << "Available:"); for (int K = FirstUnscheduled; K < NInstr; K++) { const auto &SU = DAG->SUnits[K]; // Check whether it is available - if (any_of(SU.Preds, [&](const auto &Dep) { + if (Info[K].Scheduled || any_of(SU.Preds, [&](const auto &Dep) { return !Info[Dep.getSUnit()->NodeNum].Scheduled; })) { continue; } LLVM_DEBUG(dbgs() << " SU" << K); - // Yeah, I know. This is a difficult way to schedule in the original - // node order. Have patience, my friend. - if (Best == -1) { + if (Best == -1 || Better(K, Best)) { Best = K; LLVM_DEBUG(dbgs() << "*"); } diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir index 3ac1a241beac..682ec3d11fde 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir @@ -103,10 +103,10 @@ ; CHECK-NEXT: vldb wh6, [p0], m6 ; CHECK-NEXT: vldb wl8, [p0], m6; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0 ; CHECK-NEXT: vldb.3d wh8, [p0], d0; vshuffle x9, x4, x2, r2 - ; CHECK-NEXT: vldb wl10, [p1], #32; vshuffle x1, x9, x0, r8 - ; CHECK-NEXT: vldb wh10, [p1], #32; vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 - ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 - ; CHECK-NEXT: vldb wh7, [p1], #32; mov r6, p0; vmac cm3, cm3, x3, x10, r4 + ; CHECK-NEXT: vldb wl10, [p1], #32; vshuffle x3, x4, x2, r3 + ; CHECK-NEXT: vldb wh10, [p1], #32; vshuffle x1, x9, x0, r8; vmac cm1, cm1, x9, x10, r4 + ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm3, cm3, x3, x10, r4 + ; CHECK-NEXT: vldb wh7, [p1], #32; mov r6, p0; vmac cm2, cm2, x1, x10, r4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %inner.loop ; CHECK-NEXT: // Parent Loop BB0_1 Depth=1 @@ -115,21 +115,21 @@ ; CHECK-NEXT: nopa ; vldb wh6, [p0], m6; nopx ; vmac cm5, cm5, x9, x7, r4 ; CHECK-NEXT: vldb wl8, [p0], m6; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4 ; CHECK-NEXT: vldb.3d wh8, [p0], d0; vshuffle x9, x4, x2, r2; vmac cm7, cm7, x3, x7, r4 - ; CHECK-NEXT: vldb wl10, [p1], #32; vshuffle x1, x9, x0, r8; vmac cm0, cm0, x5, x7, r4 - ; CHECK-NEXT: vldb wh10, [p1], #32; vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 - ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 + ; CHECK-NEXT: vldb wl10, [p1], #32; vshuffle x3, x4, x2, r3; vmac cm0, cm0, x5, x7, r4 + ; CHECK-NEXT: vldb wh10, [p1], #32; vshuffle x1, x9, x0, r8; vmac cm1, cm1, x9, x10, r4 + ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm3, cm3, x3, x10, r4 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; nopx ; mov r6, p0; vmac cm3, cm3, x3, x10, r4 + ; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; nopx ; mov r6, p0; vmac cm2, cm2, x1, x10, r4 ; CHECK-NEXT: // %bb.3: // %outer.loop.latch ; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: nopa ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4 ; CHECK-NEXT: vmac cm5, cm5, x9, x7, r4 ; CHECK-NEXT: add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4 ; CHECK-NEXT: vshuffle x9, x4, x2, r2; vmac cm7, cm7, x3, x7, r4 - ; CHECK-NEXT: vshuffle x1, x9, x0, r8; vmac cm0, cm0, x5, x7, r4 - ; CHECK-NEXT: vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 - ; CHECK-NEXT: vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 - ; CHECK-NEXT: vmac cm3, cm3, x3, x10, r4 + ; CHECK-NEXT: vshuffle x3, x4, x2, r3; vmac cm0, cm0, x5, x7, r4 + ; CHECK-NEXT: vshuffle x1, x9, x0, r8; vmac cm1, cm1, x9, x10, r4 + ; CHECK-NEXT: vshuffle x5, x3, x0, r8; vmac cm3, cm3, x3, x10, r4 + ; CHECK-NEXT: vmac cm2, cm2, x1, x10, r4 ; CHECK-NEXT: vmac cm4, cm4, x5, x10, r4 ; CHECK-NEXT: vmac cm5, cm5, x9, x7, r4 ; CHECK-NEXT: vmac cm6, cm6, x1, x7, r4 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir index 0f4b40ce222c..c1b8eeded62f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir @@ -36,21 +36,20 @@ ; CHECK-NEXT: vldb wl1, [p0], m4; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; vldb wh3, [p0, #32]; nopx ; CHECK-NEXT: vldb.3d wl3, [p0], d1 - ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3 - ; CHECK-NEXT: vlda wl7, [p4, #320] - ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 - ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 - ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 - ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: vlda wl9, [p4, #384] + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vlda wl7, [p4, #320]; vshuffle x8, x0, x2, r9 ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 - ; CHECK-NEXT: vshuffle x10, x4, x6, r25 ; CHECK-NEXT: vshuffle x1, x3, x5, r13 ; CHECK-NEXT: vshuffle x3, x3, x5, r24 - ; CHECK-NEXT: mov r3, p0 - ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 - ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 - ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bml2, bml2, x3, x5, r29 ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 @@ -66,28 +65,27 @@ ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 - ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 - ; CHECK-NEXT: vlda wl7, [p4, #320] - ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 - ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 - ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 - ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vlda wl9, [p4, #384] + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vlda wl7, [p4, #320]; vshuffle x8, x0, x2, r9 ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 - ; CHECK-NEXT: vshuffle x10, x4, x6, r25 ; CHECK-NEXT: vshuffle x1, x3, x5, r13 ; CHECK-NEXT: vshuffle x3, x3, x5, r24 - ; CHECK-NEXT: mov r3, p0 - ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 - ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 - ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bml2, bml2, x3, x5, r29 ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; nopb ; nopx ; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bmh2, bmh2, x10, x9, r29 ; CHECK-NEXT: vmac.f bml4, bml4, x8, x7, r29 ; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29 ; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29 @@ -113,7 +111,6 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir index 346e56f42fc1..25321ddb2188 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir @@ -39,16 +39,15 @@ ; CHECK-NEXT: vldb wh7, [p7, #32]; vshift.align x0, x0, s0, x8, r3 ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; mov r1, p0 ; CHECK-NEXT: and r2, r1, r0; vshift.align x2, x2, s0, x10, r3 - ; CHECK-NEXT: vshuffle x8, x0, x2, r9 - ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 - ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x5, x0, x2, r25 - ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 - ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 - ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x10, x4, x6, r25; vmac.f bmh7, bmh7, x8, x7, r29 - ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh9, [p4, #32]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh5, [p2, #352]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x1, x3, x5, r13; vmac.f bmh7, bmh7, x8, x7, r29 ; CHECK-NEXT: vshuffle x3, x3, x5, r24 - ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 - ; CHECK-NEXT: vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: mov p2, p5; vmac.f bml2, bml2, x3, x7, r29 ; CHECK-NEXT: vmac.f bml0, bml0, x10, x7, r29 ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 @@ -67,16 +66,15 @@ ; CHECK-NEXT: vldb wh7, [p7, #32]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; mov r1, p0 ; CHECK-NEXT: and r2, r1, r0; vshift.align x2, x2, s0, x10, r3 - ; CHECK-NEXT: vshuffle x8, x0, x2, r9 - ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 - ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x5, x0, x2, r25 - ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 - ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 - ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x10, x4, x6, r25; vmac.f bmh7, bmh7, x8, x7, r29 - ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh9, [p4, #32]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh5, [p2, #352]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x1, x3, x5, r13; vmac.f bmh7, bmh7, x8, x7, r29 ; CHECK-NEXT: vshuffle x3, x3, x5, r24 - ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 - ; CHECK-NEXT: vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: mov p2, p5; vmac.f bml2, bml2, x3, x7, r29 ; CHECK-NEXT: vmac.f bml0, bml0, x10, x7, r29 ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 @@ -92,8 +90,7 @@ ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopx ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir index b62ff74d0bd3..6631eb1b3a5f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir @@ -25,7 +25,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: add.nc lc, r0, #-2 ; CHECK-NEXT: movxm ls, #.LBB0_2 ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; nopxm ; nopv @@ -34,48 +34,60 @@ ; CHECK-NEXT: vldb wh1, [p0, #96]; nopa ; padds [p0], m4; nopxm ; nopv ; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], #128; nopxm ; nopv ; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb wh2, [p0], #32; nopxm ; nops + ; CHECK-NEXT: vldb wh2, [p0], #32; nopx ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wl9, [p1], m5 ; CHECK-NEXT: vldb wh9, [p1], m6 ; CHECK-NEXT: vldb wl10, [p1], m5 ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3 ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3 - ; CHECK-NEXT: vshuffle x7, x1, x3, r16 - ; CHECK-NEXT: vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x6, x1, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x7, x1, x3, r16; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; nopv - ; CHECK-NEXT: vldb wh0, [p0, #32]; nopx ; vmac.f bmh0, bmh0, x4, x8, r2 - ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; nopx ; vshuffle x6, x1, x3, r3; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x7, x1, x3, r16; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vmac.f bmh2, bmh2, x5, x8, r2 - ; CHECK-NEXT: padds [p0], #128; vldb wl8, [p1], m5; vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh5, bmh1, x6, x9, r2 ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh6, bmh2, x5, x9, r2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh7, bmh3, x7, x9, r2 - ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh8, bmh0, x4, x10, r2 ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bml0, bmh1, x6, x10, r2 ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3; vmac.f bml5, bmh2, x5, x11, r2 - ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; nopv + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; vshuffle x9, x9, x9, r6 - ; CHECK-NEXT: vmac.f bmh0, bmh0, x4, x8, r2 - ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x1, x3, r3; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: nopx ; vshuffle x7, x1, x3, r16; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 ; CHECK-NEXT: vmac.f bmh2, bmh2, x5, x8, r2 - ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 ; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh5, bmh1, x6, x9, r2 ; CHECK-NEXT: vmac.f bmh6, bmh2, x5, x9, r2 ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 @@ -87,6 +99,20 @@ ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir index f79564877880..92ed051e861c 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir @@ -25,7 +25,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: add.nc lc, r0, #-2 ; CHECK-NEXT: movxm ls, #.LBB0_2 ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv @@ -34,48 +34,60 @@ ; CHECK-NEXT: vldb wh1, [p0], #32; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], m4; nopxm ; nopv ; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; vldb wh2, [p0], #32; nopxm ; nops + ; CHECK-NEXT: nopa ; vldb wh2, [p0], #32; nopx ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 ; CHECK-NEXT: vldb wl9, [p1], m5 ; CHECK-NEXT: vldb wh9, [p1], m6 ; CHECK-NEXT: vldb wl10, [p1], m5 ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3 ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3 - ; CHECK-NEXT: vshuffle x7, x1, x3, r16 - ; CHECK-NEXT: vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb wl0, [p0], #32; vshuffle x6, x1, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wh1, [p0], #32; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; nopv - ; CHECK-NEXT: nopa ; vldb wh0, [p0], #32; nopx ; vmac.f bmh0, bmh0, x4, x8, r2 - ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopx ; vshuffle x6, x1, x3, r3; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; nopx ; vshuffle x7, x1, x3, r16; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 ; CHECK-NEXT: vldb wh1, [p0], #32; vmac.f bmh2, bmh2, x5, x8, r2 - ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh5, bmh1, x6, x9, r2 ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh6, bmh2, x5, x9, r2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh7, bmh3, x7, x9, r2 - ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh8, bmh0, x4, x10, r2 ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bml0, bmh1, x6, x10, r2 ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x5, x0, x2, r16; vmac.f bml4, bmh1, x6, x11, r2 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x6, x1, x3, r3; vmac.f bml5, bmh2, x5, x11, r2 - ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; nopv + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; vshuffle x9, x9, x9, r6 - ; CHECK-NEXT: vmac.f bmh0, bmh0, x4, x8, r2 - ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x1, x3, r3; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: nopx ; vshuffle x7, x1, x3, r16; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 ; CHECK-NEXT: vmac.f bmh2, bmh2, x5, x8, r2 - ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 ; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh5, bmh1, x6, x9, r2 ; CHECK-NEXT: vmac.f bmh6, bmh2, x5, x9, r2 ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 @@ -87,6 +99,20 @@ ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr