diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index a8bcbe216736..c355c9e7be19 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -704,6 +704,8 @@ AIEBaseSubtarget::getPostRAMutationsImpl(const Triple &TT) { std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations; Mutations.emplace_back(std::make_unique()); if (!TT.isAIE1()) { + if (EnableWAWStickyRegisters) + Mutations.emplace_back(std::make_unique<WAWStickyRegistersEdges>()); Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); Mutations.emplace_back(std::make_unique()); diff --git a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp index d6896bd624da..57fa9ed2ab52 100644 --- a/llvm/lib/Target/AIE/AIEMachineScheduler.cpp +++ b/llvm/lib/Target/AIE/AIEMachineScheduler.cpp @@ -1320,8 +1320,9 @@ void llvm::AIEPostRASchedStrategy::buildGraph(ScheduleDAGMI &DAG, AAResults *AA, assert(BS.getRegions().size() == 1); // Try to wrap the linear schedule within II. // We virtually unroll the body by the stagecount, computed from rounding - // up the length divided by II. - NCopies = (BS.getScheduleLength() + II - 1) / II; + // up the length divided by II, adding one more stage to account for + // the added resource contention. + NCopies = (BS.getScheduleLength() + II - 1) / II + 1; } DEBUG_BLOCKS(dbgs() << " buildGraph, NCopies=" << NCopies << "\n"); for (int S = 0; S < NCopies; S++) { @@ -1386,6 +1387,8 @@ void AIEScheduleDAGMI::schedule() { // If it succeeds, we need to implement it, if we fail we fall back on the // normal loop schedule SchedImpl->buildGraph(*this, AA); + postProcessDAG(); + auto &PostSWP = BS.getPostSWP(); if (PostSWP.schedule(*this, BS.FixPoint.II)) { BS.setPipelined(); diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 961aacb6acca..48f194d0878b 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -12,14 +12,23 @@ //===----------------------------------------------------------------------===// #include "AIEPostPipeliner.h" +#include "AIESlotCounts.h" #include "Utils/AIELoopUtils.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "postpipeliner" +#define DEBUG_SUMMARY(X) DEBUG_WITH_TYPE("postpipeliner-summary", X) +#define DEBUG_FULL(X) DEBUG_WITH_TYPE("postpipeliner-full", X) namespace llvm::AIE { +static cl::opt<int> + Heuristic("aie-postpipeliner-heuristic", + cl::desc("Select one specific post-pipeliner heuristic"), + cl::init(-1), cl::Hidden); + PipelineScheduleVisitor::~PipelineScheduleVisitor() {} class PostPipelineDumper : public PipelineScheduleVisitor { @@ -106,32 +115,24 @@ bool PostPipeliner::canAccept(MachineBasicBlock &LoopBlock) { return true; } +static SlotCounts getSlotCounts(MachineInstr &MI, const AIEBaseInstrInfo *TII) { + auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode())); + return SlotInfo ? SlotInfo->getSlotSet() : 0; +} + int PostPipeliner::getResMII(MachineBasicBlock &LoopBlock) { - // For each instruction, find the first cycle in which it fits and collect the - // maximum - std::vector<SlotBits> Scoreboard(NInstr, 0); - int MII = 1; + // Add up all slot requirements and return the maximum slot count + SlotCounts Counts; for (auto &MI : LoopBlock) { - auto *SlotInfo = TII->getSlotInfo(TII->getSlotKind(MI.getOpcode())); - SlotBits Slots = SlotInfo ? 
SlotInfo->getSlotSet() : 0; - - int C = 0; - while (C < NInstr && (Scoreboard[C] & Slots)) { - C++; - } - if (C >= NInstr) { - MII = NInstr; - break; - } - Scoreboard[C] |= Slots; - MII = std::max(MII, C + 1); + Counts += getSlotCounts(MI, TII); } + int MII = Counts.max(); LLVM_DEBUG(dbgs() << "PostPipeliner: ResMII=" << MII << "\n"); return MII; } -// This assigns Cycle of SU, Earliest of its predecessors and Earliest of -// the next instance of SU. +// This assigns Cycle of SU, Earliest of its successors and Latest of its +// predecessors void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { LLVM_DEBUG(dbgs() << "PostPipeline " << SU.NodeNum << " in cycle " << Cycle << ". "); @@ -145,8 +146,25 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { const int SNum = Succ->NodeNum; const int NewEarliest = Cycle + Latency; if (NewEarliest > Info[SNum].Earliest) { + Info[SNum].LastEarliestPusher = SU.NodeNum; Info[SNum].Earliest = NewEarliest; - LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << "; "); + Info[SU.NodeNum].NumPushedEarliest++; + LLVM_DEBUG(dbgs() << SNum << " to " << Info[SNum].Earliest << " -; "); + } + } + for (auto &Dep : SU.Preds) { + int Latency = Dep.getSignedLatency(); + auto *Pred = Dep.getSUnit(); + if (Pred->isBoundaryNode()) { + continue; + } + const int PNum = Pred->NodeNum; + const int NewLatest = Cycle - Latency; + if (NewLatest < Info[PNum].Latest) { + Info[PNum].LastLatestPusher = SU.NodeNum; + Info[PNum].Latest = NewLatest; + Info[SU.NodeNum].NumPushedLatest++; + LLVM_DEBUG(dbgs() << PNum << " to - " << Info[PNum].Latest << "; "); } } LLVM_DEBUG(dbgs() << "\n"); @@ -160,8 +178,10 @@ void PostPipeliner::scheduleNode(SUnit &SU, int Cycle) { // Check resources. We only insert at the position modulo II. Since we insert // all iterations separately, the resources that wrap around accumulate in the // overflow area, causing conflicts when inserting future iterations -int PostPipeliner::fit(MachineInstr *MI, int Earliest, int NTries, int II) { - for (int C = Earliest; C < Earliest + NTries; C++) { +int PostPipeliner::fit(MachineInstr *MI, int First, int Last, int II) { + const int Step = First > Last ? -1 : 1; + LLVM_DEBUG(dbgs() << " " << First << ", " << Last << ", " << Step << "\n"); + for (int C = First; C != Last; C += Step) { int Mod = C % II; LLVM_DEBUG(dbgs() << " at " << C << " (" << Mod << ")\n"); if (!HR.checkConflict(Scoreboard, *MI, -Depth + Mod)) { @@ -174,49 +194,153 @@ int PostPipeliner::fit(MachineInstr *MI, int Earliest, int NTries, int II) { return -1; } -void PostPipeliner::computeLoopCarriedParameters() { - // We schedule the first iteration, only using earliest. This updates - // earliest of the successors. Any successor in the second iteration - // represents a loop carried dependence, and we account for that by - // propagating its Earliest back to the first iteration - // Note that we don't have to clean the effects of this exploration, - // since the real scheduling will overwrite Cycle, and the ultimate Earliest - // will never be less than we compute here. 
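// For intuition, a minimal standalone sketch (not part of the patch) of the
// forward sweep that computeForward() below performs: node numbers follow the
// original instruction order, which is a topological order, so a single pass
// suffices to relax Earliest across all edges. ToyNode and toyComputeForward
// are illustrative names only, not identifiers from the patch.
#include <algorithm>
#include <utility>
#include <vector>

struct ToyNode {
  int Earliest = 0;
  std::vector<std::pair<int, int>> Succs; // (successor index, latency)
};

void toyComputeForward(std::vector<ToyNode> &Nodes) {
  for (int K = 0; K < (int)Nodes.size(); K++)
    for (auto [S, Latency] : Nodes[K].Succs)
      // A successor can start no earlier than our Earliest plus the latency.
      Nodes[S].Earliest = std::max(Nodes[S].Earliest, Nodes[K].Earliest + Latency);
}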
- +void PostPipeliner::computeForward() { + // The forward order defines a topological sort, so we can compute + // Earliest and Ancestors in a single forward sweep for (int K = 0; K < NInstr; K++) { + auto &Me = Info[K]; + SUnit &SU = DAG->SUnits[K]; + for (auto &Dep : SU.Preds) { + if (Dep.getKind() != SDep::Data) { + continue; + } + int P = Dep.getSUnit()->NodeNum; + assert(P < K); + Me.Ancestors.insert(P); + auto &Pred = Info[P]; + for (int Anc : Pred.Ancestors) { + Me.Ancestors.insert(Anc); + } + } + for (auto &Dep : SU.Succs) { + auto *Succ = Dep.getSUnit(); + if (Succ->isBoundaryNode()) { + continue; + } + auto &SInfo = Info[Succ->NodeNum]; + const int NewEarliest = Me.Earliest + Dep.getSignedLatency(); + SInfo.Earliest = std::max(SInfo.Earliest, NewEarliest); + } + Me.Slots = getSlotCounts(*SU.getInstr(), TII); + } +} + +bool PostPipeliner::computeBackward() { + bool Changed = false; + + auto AddOffspring = [&Changed](NodeInfo &Info, int E) { + if (Info.Offspring.insert(E).second) { + Changed = true; + } + }; + + // Traversing backwards will speed convergence a bit + for (int K = NInstr - 1; K >= 0; K--) { SUnit &SU = DAG->SUnits[K]; - const int Earliest = Info[K].Earliest; - scheduleNode(SU, Earliest); + auto &Me = Info[K]; + const int Latest = Info[K].Latest; + for (auto &Dep : SU.Preds) { + if (Dep.getKind() != SDep::Data) { + continue; + } + int P = Dep.getSUnit()->NodeNum; + auto &Pred = Info[P]; + AddOffspring(Pred, K); + for (auto Offs : Me.Offspring) { + AddOffspring(Pred, Offs); + } + int NewLatest = Latest - Dep.getSignedLatency(); + if (NewLatest < Pred.Latest) { + Pred.Latest = NewLatest; + Changed = true; + } + } } + return Changed; +} + +bool PostPipeliner::computeLoopCarriedParameters() { + + // Forward properties like Earliest and Ancestors. + computeForward(); - // Propagate Earliest upstream, initialize Latest + // Backward properties like Latest and Offspring. + // Use a fixpoint loop, because plain reversed order may not be topological + // for predecessors + while (computeBackward()) { + /* EMPTY */; + } + + // Adjust Earliest and Latest with resource requirements. + // FIXME: We do not account for negative latencies here. This can lead to + // suboptimality, but we only include true dependences, where negative + // latencies are rare. for (int K = 0; K < NInstr; K++) { - const int K2 = K + NInstr; - const int Earliest = Info[K2].Earliest - II; + auto &Me = Info[K]; + SlotCounts ASlots(Me.Slots); + for (int A : Me.Ancestors) { + ASlots += Info[A].Slots; + } + SlotCounts OSlots(Me.Slots); + for (int O : Me.Offspring) { + OSlots += Info[O].Slots; + } + LLVM_DEBUG(dbgs() << "SU" << K << " : " << Info[K].Earliest << " - " + << Info[K].Latest << " " << ASlots << " " << OSlots + << "\n"); + Me.Earliest = std::max(Me.Earliest, 0 + (ASlots.max() - 1)); + Me.Latest = std::min(Me.Latest, -1 - (OSlots.max() - 1)); + LLVM_DEBUG(dbgs() << " -> " << Info[K].Earliest << " - " + << Info[K].Latest << "\n"); + } + + // Loop carried dependences will have pushed away Earliest of the second + // iteration, which should stay in lock step with the first. + for (int K = 0; K < NInstr; K++) { + const int KNextIter = K + NInstr; + const int Earliest = Info[KNextIter].Earliest - II; Info[K].Earliest = std::max(Info[K].Earliest, Earliest); - // Unrestricted: Beyond the last stage. - Info[K].Latest = NCopies * II; } - // Propagate Latest upstream. 
Latest is the latest - // that is admissible for Earliest to be achievable within II + + // Make Earliest of the second iteration push up Latest of the first for (int K = 0; K < NInstr; K++) { - const int K2 = K + NInstr; - const int Earliest = Info[K2].Earliest; - const auto &SU = DAG->SUnits[K2]; - for (auto &Dep : SU.Preds) { - const auto *Pred = Dep.getSUnit(); - // Any predecessor in the first iteration - int K1 = Pred->NodeNum; - if (K1 < NInstr) { - const int Latest = Earliest - Dep.getSignedLatency(); - Info[K1].Latest = std::min(Info[K1].Latest, Latest); + auto &Me = Info[K]; + int LCDLatest = Me.Latest; + auto &SU = DAG->SUnits[K]; + for (auto &Dep : SU.Succs) { + const int S = Dep.getSUnit()->NodeNum; + if (S < NInstr) { + continue; } + const int Earliest = Info[S - NInstr].Earliest; + const int Latest = Earliest - Dep.getSignedLatency(); + LCDLatest = std::min(LCDLatest, Latest); + } + Me.LCDLatest = LCDLatest; + if (LCDLatest != Me.Latest) { + LLVM_DEBUG(dbgs() << "SU" << K << " LCDLatest=" << Me.LCDLatest << "\n"); } } - LLVM_DEBUG(for (int K = 0; K < NInstr; K++) { - dbgs() << "SU" << K << " : " << Info[K].Earliest << " - " << Info[K].Latest - << "\n"; - }); + + // Save the static values for ease of reset + for (auto &N : Info) { + N.StaticEarliest = N.Earliest; + N.StaticLatest = N.Latest; + } + return true; +} + +int PostPipeliner::computeMinScheduleLength() const { + // The minimum length makes sure that every node has a range in which it + // can be scheduled + int MinLength = II; + for (int K = 0; K < NInstr; K++) { + auto &Node = Info[K]; + while (Node.Earliest > Node.Latest + MinLength) { + MinLength += II; + } + } + return MinLength; } void dumpGraph(int NInstr, const std::vector<NodeInfo> &Info, @@ -238,33 +362,54 @@ if (S >= NInstr) { dbgs() << "_" << S % NInstr; } - dbgs() << "# L=" << Dep.getSignedLatency() << "\n"; + if (Dep.getKind() == SDep::Data) { + dbgs() << " [color=red] "; + } else if (Dep.getKind() == SDep::Output) { + dbgs() << " [color=black] "; + } else if (Dep.getKind() == SDep::Anti) { + dbgs() << " [color=blue] "; + } + + dbgs() << " # L=" << Dep.getSignedLatency(); + if (Dep.getKind() == SDep::Output) { + dbgs() << " WAW"; + } + dbgs() << "\n"; } } dbgs() << "}\n"; } -int PostPipeliner::mostUrgent() { - assert(FirstUnscheduled < NInstr); +int PostPipeliner::mostUrgent(PostPipelinerStrategy &Strategy) { + assert(FirstUnscheduled <= LastUnscheduled); while (Info[FirstUnscheduled].Scheduled) { FirstUnscheduled++; } - assert(FirstUnscheduled < NInstr); + while (Info[LastUnscheduled].Scheduled) { + LastUnscheduled--; + } + assert(FirstUnscheduled <= LastUnscheduled); + + auto NotScheduled = [&](const auto &Dep) { + auto *SU = Dep.getSUnit(); + if (SU->isBoundaryNode()) { + return false; + } + int N = SU->NodeNum; + return N < NInstr && !Info[N].Scheduled; + }; int Best = -1; LLVM_DEBUG(dbgs() << "Available:"); - for (int K = FirstUnscheduled; K < NInstr; K++) { + for (int K = FirstUnscheduled; K <= LastUnscheduled; K++) { const auto &SU = DAG->SUnits[K]; + auto &Edges = Strategy.fromTop() ? SU.Preds : SU.Succs; // Check whether it is available - if (any_of(SU.Preds, [&](const auto &Dep) { - return !Info[Dep.getSUnit()->NodeNum].Scheduled; - })) { + if (Info[K].Scheduled || any_of(Edges, NotScheduled)) { continue; } LLVM_DEBUG(dbgs() << " SU" << K); - // Yeah, I know. This is a difficult way to schedule in the original - // node order. Have patience, my friend. 
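// A minimal standalone sketch (not part of the patch) of the selection that
// mostUrgent() performs with the new Strategy hook: among the ready nodes,
// keep the first candidate that no later candidate beats according to the
// strategy's better() predicate. ToyStrategy and toyPickBest are illustrative
// names only, not identifiers from the patch.
#include <vector>

struct ToyStrategy {
  // Return true when A is strictly preferable to B; plain node order here.
  bool better(int A, int B) const { return A < B; }
};

int toyPickBest(const std::vector<int> &Ready, const ToyStrategy &S) {
  int Best = -1;
  for (int N : Ready)
    if (Best == -1 || S.better(N, Best))
      Best = N;
  return Best; // -1 when no node is ready
}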
- if (Best == -1) { + if (Best == -1 || Strategy.better(SU, DAG->SUnits[Best])) { Best = K; LLVM_DEBUG(dbgs() << "*"); } @@ -274,24 +419,41 @@ int PostPipeliner::mostUrgent() { return Best; } -bool PostPipeliner::scheduleFirstIteration() { +void PostPipeliner::resetSchedule(bool FullReset) { + Scoreboard.clear(); + for (int K = 0; K < NTotalInstrs; K++) { + auto &N = Info[K]; + N.reset(FullReset); + if (K < NInstr) { + N.Earliest = N.StaticEarliest; + N.Latest = N.StaticLatest; + } + } + + FirstUnscheduled = 0; + LastUnscheduled = NInstr - 1; +} + +bool PostPipeliner::scheduleFirstIteration(PostPipelinerStrategy &Strategy) { // Set up the basic schedule from the original instructions for (int K = 0; K < NInstr; K++) { - const int N = mostUrgent(); + const int N = mostUrgent(Strategy); LLVM_DEBUG(dbgs() << " Trying " << N << "\n"); SUnit &SU = DAG->SUnits[N]; MachineInstr *MI = SU.getInstr(); - const int Earliest = Info[N].Earliest; + const int Earliest = Strategy.earliest(SU); + const int Latest = Strategy.latest(SU); // Find the first cycle that fits. We try every position modulo II - const int Actual = fit(MI, Earliest, II, II); + const int Actual = Strategy.fromTop() ? fit(MI, Earliest, Latest + 1, II) : fit(MI, Latest, Earliest - 1, II); if (Actual < 0) { // out of resources for this II; LLVM_DEBUG(dbgs() << "Out of resources\n"); return false; } + Strategy.selected(SU); const int LocalCycle = Actual % II; const MemoryBankBits MemoryBanks = HR.getMemoryBanks(MI); - LLVM_DEBUG(dbgs() << " Emit in " << -Depth + LocalCycle << "\n"); int Cycle = -Depth + LocalCycle; LLVM_DEBUG(dbgs() << " Emit in " << Cycle << "\n"); for (int N = 0; N < NCopies; N++) { @@ -306,12 +468,23 @@ bool PostPipeliner::scheduleFirstIteration() { scheduleNode(SU, Actual); Info[N].Scheduled = true; - LLVM_DEBUG(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull();); + DEBUG_FULL(dbgs() << "Scoreboard\n"; Scoreboard.dumpFull();); } - LLVM_DEBUG(dbgs() << "==== First iteration scheduled ======\n"); + LLVM_DEBUG(dbgs() << "==== First iteration scheduled by " << Strategy.name() + << " ====\n"); return true; } +namespace { +void dumpEarliestChain(const std::vector<NodeInfo> &Info, int N) { + auto Prev = Info[N].LastEarliestPusher; + if (Prev) { + dumpEarliestChain(Info, *Prev); + } + dbgs() << " --> " << N << " @" << Info[N].Cycle << "\n"; +} +} // namespace + bool PostPipeliner::scheduleOtherIterations() { // Make sure that all the copies can be placed at II from the previous one. 
// This looks like overkill, but it accommodates dependences that span @@ -328,8 +501,9 @@ // All iterations following the first one should fit exactly if (Earliest > Insert) { - LLVM_DEBUG(dbgs() << " Latency not met (Earliest=" << Earliest - << ")\n"); + LLVM_DEBUG(dbgs() << " Latency not met for " << N + << " (Earliest=" << Earliest << ")\n"; + dumpEarliestChain(Info, N);); return false; } @@ -339,6 +513,195 @@ return true; } +class DefaultStrategy : public PostPipelinerStrategy { +public: + DefaultStrategy(ScheduleDAGMI &DAG, std::vector<NodeInfo> &Info, + int LatestBias) + : PostPipelinerStrategy(DAG, Info, LatestBias) {} + bool better(const SUnit &A, const SUnit &B) override { + return Info[A.NodeNum].Latest < Info[B.NodeNum].Latest; + } +}; + +class ConfigStrategy : public PostPipelinerStrategy { + bool TopDown = true; + +public: + enum PriorityComponent { + NodeNum, + Latest, + Critical, + Sibling, + LCDLatest, + Size + }; + static std::string getPriorityName(PriorityComponent Component) { + switch (Component) { + case PriorityComponent::NodeNum: + return "NodeNum"; + case PriorityComponent::Latest: + return "Latest"; + case PriorityComponent::Critical: + return "Critical"; + case PriorityComponent::Sibling: + return "Sibling"; + case PriorityComponent::LCDLatest: + return "LcdLatest"; + default: + break; + } + return "Size - Illegal"; + } + +private: + std::string Name; + std::set<int> SuccSiblingScheduled; + std::set<int> PredSiblingScheduled; + std::function<bool(const SUnit &, const SUnit &)> + Discriminators[PriorityComponent::Size] = { + [&](const SUnit &A, const SUnit &B) { + return TopDown ? A.NodeNum < B.NodeNum : A.NodeNum > B.NodeNum; + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return TopDown ? IA.Latest < IB.Latest : IA.Earliest > IB.Earliest; + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return TopDown ? IA.NumPushedEarliest > IB.NumPushedEarliest + : IA.NumPushedLatest > IB.NumPushedLatest; + }, + [&](const SUnit &A, const SUnit &B) { + std::set<int> &Sibling = TopDown ? 
SuccSiblingScheduled : PredSiblingScheduled; + return Sibling.count(A.NodeNum) > Sibling.count(B.NodeNum); + }, + [&](const SUnit &A, const SUnit &B) { + auto &IA = Info[A.NodeNum]; + auto &IB = Info[B.NodeNum]; + return IA.LCDLatest < IB.LCDLatest; + }, + }; + std::vector<PriorityComponent> Priority; + + bool fromTop() override { return TopDown; } + + bool better(const SUnit &A, const SUnit &B) override { + for (auto P : Priority) { + if (Discriminators[P](A, B)) { + return true; + } + } + return false; + } + + void selected(const SUnit &N) override { + // Promote the critical path + NodeInfo *Pushed = &Info[N.NodeNum]; + while (Pushed->LastEarliestPusher) { + Pushed = &Info[*Pushed->LastEarliestPusher]; + Pushed->NumPushedEarliest++; + } + + // Promote my siblings + for (auto &SDep : N.Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + for (auto &PDep : SDep.getSUnit()->Preds) { + if (PDep.getKind() != SDep::Data) { + continue; + } + SuccSiblingScheduled.insert(PDep.getSUnit()->NodeNum); + } + } + for (auto &PDep : N.Preds) { + if (PDep.getKind() != SDep::Data) { + continue; + } + for (auto &SDep : PDep.getSUnit()->Succs) { + if (SDep.getKind() != SDep::Data) { + continue; + } + PredSiblingScheduled.insert(PDep.getSUnit()->NodeNum); + } + } + } + +public: + std::string name() override { return Name; } + ConfigStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info, + int Length, bool TopDown, + ArrayRef<PriorityComponent> Components) + : PostPipelinerStrategy(DAG, Info, Length), TopDown(TopDown) { + Name = "Config_" + std::to_string(Length) + "_" + std::to_string(TopDown); + for (auto Comp : Components) { + Name += "_" + getPriorityName(Comp); + Priority.emplace_back(Comp); + } + } +}; + +static const struct { + int ExtraStages; + bool TopDown; + bool Rerun; + ConfigStrategy::PriorityComponent Components[3]; +} Strategies[] = { + // Loosely speaking, a lower value of the first parameter targets + // a lower stage count, which benefits code size. + // Rerun is only useful for heuristics that use it, e.g. 
Critical + {1, true, false, {ConfigStrategy::NodeNum}}, + {1, true, false, {ConfigStrategy::Latest}}, + {1, true, true, {ConfigStrategy::Critical}}, + {1, true, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + {0, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + {1, false, true, {ConfigStrategy::Critical, ConfigStrategy::LCDLatest}}, + // This is pure bottom up + {1, false, false, {ConfigStrategy::NodeNum}}, +}; + +bool PostPipeliner::tryHeuristics() { + int MinLength = computeMinScheduleLength(); + + DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n"); + + int HeuristicIndex = 0; + for (auto &[ExtraStages, TopDown, Rerun, Components] : Strategies) { + if (Heuristic >= 0 && Heuristic != HeuristicIndex++) { + continue; + } + ConfigStrategy S(*DAG, Info, MinLength + ExtraStages * II, TopDown, + Components); + resetSchedule(/*FullReset=*/true); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() << "\n"); + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " found II=" << II + << "\n"); + return true; + } + + DEBUG_SUMMARY(dbgs() << " failed\n"); + if (!Rerun) { + continue; + } + + // Rerun with dynamic information retained + resetSchedule(/*FullReset=*/false); + DEBUG_SUMMARY(dbgs() << "--- Strategy " << S.name() + << " with critical path"); + if (scheduleFirstIteration(S) && scheduleOtherIterations()) { + DEBUG_SUMMARY(dbgs() << " found II=" << II << "\n"); + return true; + } + DEBUG_SUMMARY(dbgs() << " failed\n"); + } + DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n"); + return false; +} + bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { NTotalInstrs = TheDAG.SUnits.size(); assert(NTotalInstrs % NInstr == 0); @@ -349,7 +712,6 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { } II = InitiationInterval; DAG = &TheDAG; - FirstUnscheduled = 0; // Let's not skimp on size here. This allows us to insert any instruction // in the unrolled dag. 
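// A minimal standalone sketch (not part of the patch) of the modulo resource
// probe behind fit(): candidate cycles are scanned from First towards Last,
// and a cycle is accepted when its slot requirements do not collide with what
// is already recorded at that cycle modulo II. A plain bitmask per modulo
// cycle stands in for the hazard-recognizer scoreboard; toyFit is an
// illustrative name only, and ModuloScoreboard must hold at least II entries.
#include <cstdint>
#include <vector>

int toyFit(const std::vector<uint64_t> &ModuloScoreboard, uint64_t Slots,
           int First, int Last, int II) {
  const int Step = First > Last ? -1 : 1;
  for (int C = First; C != Last; C += Step) {
    const int Mod = ((C % II) + II) % II; // keep the index non-negative
    if (!(ModuloScoreboard[Mod] & Slots))
      return C; // first conflict-free cycle in the scan direction
  }
  return -1; // out of resources for this II
}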
@@ -358,12 +720,14 @@ bool PostPipeliner::schedule(ScheduleDAGMI &TheDAG, int InitiationInterval) { Info.clear(); Info.resize(NTotalInstrs); + LLVM_DEBUG(for (int I = 0; I < NInstr; I++) { dbgs() << I << " " << *DAG->SUnits[I].getInstr(); }); LLVM_DEBUG(dumpGraph(NInstr, Info, DAG)); computeLoopCarriedParameters(); - if (!scheduleFirstIteration() || !scheduleOtherIterations()) { + + if (!tryHeuristics()) { LLVM_DEBUG(dbgs() << "PostPipeliner: No schedule found\n"); return false; } @@ -461,4 +825,17 @@ void PostPipeliner::updateTripCount() const { TII->adjustTripCount(*TripCountDef, -Delta); } +void NodeInfo::reset(bool FullReset) { + Cycle = 0; + Scheduled = false; + Earliest = 0; + Latest = -1; + if (FullReset) { + NumPushedEarliest = 0; + NumPushedLatest = 0; + LastEarliestPusher = {}; + LastLatestPusher = {}; + } +} + } // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.h b/llvm/lib/Target/AIE/AIEPostPipeliner.h index d9bcb558103a..5fa8ca8d7f49 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.h +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.h @@ -15,8 +15,10 @@ #define LLVM_LIB_TARGET_AIE_AIEPOSTPIPELINER_H #include "AIEHazardRecognizer.h" +#include "AIESlotCounts.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ResourceScoreboard.h" +#include #include namespace llvm { @@ -43,14 +45,77 @@ class NodeInfo { int ModuloCycle = 0; // Cycle / II int Stage = 0; + // The earliest cycle at which this can be scheduled to meet latencies // This includes the lowerbound of the modulo condition, i.e. // Earliest(N) >= Cycle(N - NInstr) + II int Earliest = 0; - // For an LCD K1 -> K2, this holds II + Earliest(K2 - NInstr) - Latency(LCD) - // Instructions with lower Latest have higher priority in the - // top down scheduling - int Latest = 0; + + // The latest cycle at which this can be scheduled. This is a negative value + // relative to the length of the linear schedule. + // So -1 is the last cycle of the linear schedule, -Length is the first cycle + // of the linear schedule. Note that this length is usually rounded up to + // the next multiple of the initiation interval + int Latest = -1; + + // These are the values of Earliest and Latest as computed from the a-priori + // computations. During scheduling Earliest and Latest may be adjusted to + // more accurate values. The two values are cached here to facilitate cheaper + // reset before trying a new strategy for the same II. + int StaticEarliest = 0; + int StaticLatest = -1; + + // Slots necessary for this instruction. + SlotCounts Slots; + + // Record critical path components + // The Pred/Succ that pushed my Earliest/Latest + std::optional<int> LastEarliestPusher; + std::optional<int> LastLatestPusher; + // The number of Succs/Preds whose Earliest/Latest I have pushed. + int NumPushedEarliest = 0; + int NumPushedLatest = 0; + + // Latest corrected by taking Earliest of an LCD successor into account + int LCDLatest = -1; + + // The transitive closure of my predecessors + std::unordered_set<int> Ancestors; + + // The transitive closure of my successors + std::unordered_set<int> Offspring; + + /// Reset the node to the values computed statically /// If FullReset is true, also reset the accumulated dynamic data. 
+ void reset(bool FullReset); +}; + +class PostPipelinerStrategy { +protected: + ScheduleDAGInstrs &DAG; + std::vector<NodeInfo> &Info; + int LatestBias = 0; + +public: + PostPipelinerStrategy(ScheduleDAGInstrs &DAG, std::vector<NodeInfo> &Info, + int LatestBias) + : DAG(DAG), Info(Info), LatestBias(LatestBias) {}; + virtual ~PostPipelinerStrategy() {}; + // Provide a name for logging purposes + virtual std::string name() { return "PostPipelinerStrategy"; } + // Choose among available alternatives + virtual bool better(const SUnit &A, const SUnit &B) { return false; } + // Define the earliest cycle in which to insert \p N + virtual int earliest(const SUnit &N) { return Info[N.NodeNum].Earliest; } + // Define the latest cycle in which to insert \p N + virtual int latest(const SUnit &N) { + return Info[N.NodeNum].Latest + LatestBias; + } + // Select from top or from bottom. + virtual bool fromTop() { return true; } + // Report a final selection. This marks the start of selecting a new node. + // fromTop() should be invariant between calls to selected() + virtual void selected(const SUnit &N) {}; }; class PipelineScheduleVisitor { @@ -72,6 +137,7 @@ class PostPipeliner { int NTotalInstrs = 0; int FirstUnscheduled = 0; + int LastUnscheduled = -1; /// Holds the cycle of each SUnit. The following should hold: /// Cycle(N) mod II == Cycle(N % NInstr) mod II @@ -96,7 +162,8 @@ int II = 1; int NStages = 0; - /// Place SU in cycle Cycle; update Earliest of dependent instructions + /// Place SU in cycle Cycle; update Earliest of successors and Latest + /// of predecessors. void scheduleNode(SUnit &SU, int Cycle); /// Compute the stage in which each instruction runs @@ -108,20 +175,38 @@ int fit(MachineInstr *MI, int Earliest, int NTries, int II); /// Provide some look ahead by seeing the effect of the first iteration - /// on the second iteration. - void computeLoopCarriedParameters(); + /// on the second iteration. May return false if the II isn't feasible. + bool computeLoopCarriedParameters(); + + /// Helpers of computeLoopCarriedParameters() + void computeForward(); + bool computeBackward(); + + // Given Earliest and Latest of each node in the first iteration, + // compute the smallest length of the linear schedule that is feasible. + // This length will be a multiple of the initiation interval. + int computeMinScheduleLength() const; + + /// Try all heuristics, stop at the first that fits the II. + /// If it returns true, a valid schedule is laid down in Info. + bool tryHeuristics(); /// Find the first available unscheduled instruction with the highest /// priority - int mostUrgent(); + int mostUrgent(PostPipelinerStrategy &Strategy); /// Schedule the original instructions, taking the modulo scoreboard /// into account - bool scheduleFirstIteration(); + bool scheduleFirstIteration(PostPipelinerStrategy &Strategy); /// Check that all copied instructions can run in the same modulo cycle bool scheduleOtherIterations(); + /// Reset dynamic scheduling data. 
+ /// If FullReset is set, also reset the information collected from earlier + /// data-mining scheduling rounds. + void resetSchedule(bool FullReset); + public: PostPipeliner(const AIEHazardRecognizer &HR, int NInstr); diff --git a/llvm/lib/Target/AIE/AIESlotCounts.cpp b/llvm/lib/Target/AIE/AIESlotCounts.cpp new file mode 100644 index 000000000000..e9d85e552ef7 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotCounts.cpp @@ -0,0 +1,72 @@ +//===- AIESlotCounts.cpp - SlotCount utility ------------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIESlotCounts.h" + +namespace llvm { +namespace AIE { + +SlotCounts::SlotCounts(SlotBits Bits) { + while (Bits) { + assert(Size < MaxSlots); + Counts[Size] = Bits & 1; + Size++; + Bits >>= 1; + } +} + +SlotCounts::SlotCounts(const SlotCounts &Org) : Size(Org.Size) { + for (int I = 0; I < Size; I++) { + Counts[I] = Org.Counts[I]; + } +} + +int SlotCounts::max() { + int Max = 0; + for (int I = 0; I < Size; I++) { + Max = std::max(Max, int(Counts[I])); + } + return Max; +} + +SlotCounts &SlotCounts::operator+=(const SlotCounts &Other) { + // The common part + for (int I = 0; I < Size && I < Other.Size; I++) { + Counts[I] += Other.Counts[I]; + } + // Any excess from the other + while (Size < Other.Size) { + Counts[Size] = Other.Counts[Size]; + Size++; + } + assert(Size >= Other.Size); + assert(Size < MaxSlots); + return *this; +} + +SlotCounts SlotCounts::operator+(const SlotCounts &Other) const { + SlotCounts Result(*this); + return Result += Other; +} + +} // namespace AIE + +raw_ostream &operator<<(raw_ostream &OS, const AIE::SlotCounts &Val) { + OS << "{ "; + const char *Sep = ""; + for (int I = 0; I < Val.size(); I++) { + OS << Sep << Val[I]; + Sep = ", "; + } + OS << " }"; + return OS; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AIE/AIESlotCounts.h b/llvm/lib/Target/AIE/AIESlotCounts.h new file mode 100644 index 000000000000..34ddd0d09542 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotCounts.h @@ -0,0 +1,55 @@ +//===- AIESlotCounts.h - Resource computation utility ---------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// This defines a class that can be used to tally up the slots required for +// one or more instructions +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIESLOTCOUNTS_H +#define LLVM_LIB_TARGET_AIE_AIESLOTCOUNTS_H + +#include "MCTargetDesc/AIEFormat.h" + +namespace llvm { +namespace AIE { + +/// Efficient representation of slot requirements +class SlotCounts { + static const int MaxSlots = 16; + int Counts[MaxSlots]; + // The number of valid Counts. Further counts are assumed to be zero. 
+ int Size = 0; + +public: + // Useful constructors + SlotCounts() = default; + SlotCounts(SlotBits Bits); + SlotCounts(const SlotCounts &Org); + SlotCounts &operator=(const SlotCounts &Rhs) = default; + + // Compute the number of required cycles + int max(); + + // Add slot counts of Other to this + SlotCounts &operator+=(const SlotCounts &Other); + + // By-value addition. + SlotCounts operator+(const SlotCounts &Other) const; + + // Indexing + const int &operator[](int I) const { return Counts[I]; }; + + int size() const { return Size; } +}; +} // namespace AIE + +raw_ostream &operator<<(raw_ostream &OS, const AIE::SlotCounts &Val); + +} // namespace llvm +#endif // LLVM_LIB_TARGET_AIE_AIESLOTCOUNTS_H diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 85dda5330112..191ea6305274 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -94,6 +94,7 @@ add_llvm_target(AIECodeGen AIEPseudoBranchExpansion.cpp AIERegClassConstrainer.cpp AIERegisterInfo.cpp + AIESlotCounts.cpp AIESplitInstructionRewriter.cpp AIESubRegConstrainer.cpp AIESubtarget.cpp diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/README b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/README new file mode 100644 index 000000000000..72bf8ee7396e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/README @@ -0,0 +1,19 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +The tests in this directory are single loops that need to be post-pipelined. +They supply the input to the scheduler and test the generated assembly code. +It is likely that the detailed schedule changes over time; the checks can be +automatically updated provided that neither the II nor the stage count grows. +If the stage count grows, an automatic update is allowed if the II shrinks. +The II is the number of lines from the loop block's label up to and including +the cycle headed by the loop end label. +The stage count is determined by the immediate operand of the instruction that +sets the lc register. + +Note that the LLVM IR doesn't match the actual MIR code. It is just a standard +loop providing some pointers into different spaces to dereference. + diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir index fe25c964c908..92e2e7a8089b 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir @@ -5,7 +5,8 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - 2>&1 | FileCheck %s # add-store can run in a two-stage II=1 pipeline diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir index 06aa9aeb6418..401cbb80ed65 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir @@ -7,7 +7,8 @@ # (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates -# RUN: llc -O2 --mtriple=aie2 --start-before=postmisched %s -o - | FileCheck %s +# RUN: llc -O2 --mtriple=aie2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - 2>&1 | FileCheck %s --- | define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %cond, ptr %cond.i50, <16 x i32> %0, i32 %cond67.i79, i20 %idx.ext.i.i81, i20 %idx.ext.i404.i, i20 %idx.ext.i410.i, i20 %idx.ext.i434.i85, i32 %1, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i32 %7, i32 %8, i32 %or9.i.i.i.i.i96, i32 %9, i20 %idx.ext.i422.i82, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i20 %20, i20 %21, i20 %22, i20 %23, i32 %conv192.i107, i20 %24, i20 %idx.ext.i428.i, i20 %25, i20 %26, i20 %27, i32 %28) #0 { diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir new file mode 100644 index 000000000000..6866f48a3518 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir @@ -0,0 +1,238 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from conv2d_bf16_0 + +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; mov p7, p5 + ; CHECK-NEXT: vldb wl8, [p0], m4 + ; CHECK-NEXT: vldb wh10, [p0, #32] + ; CHECK-NEXT: vldb wl10, [p0], m4 + ; CHECK-NEXT: vldb wh1, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0], m4 + ; CHECK-NEXT: vldb wh3, [p0, #32]; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: vlda wl7, [p4, #320]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; vlda wh9, [p4, #416]; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl9, [p4, #384]; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv + ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl5, [p5], #256; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv + ; CHECK-NEXT: nopb ; vlda wh11, [p4, #480]; nops ; nopx ; vshift.align x6, x6, s0, x3, r3; nopv + ; CHECK-NEXT: nopb ; vlda wl11, [p4, #448]; nops ; nopx ; vshuffle x3, x4, x6, r9; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: nopa ; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; 
CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopx ; mov p7, p5; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: nopa ; vldb wl8, [p0], m4; nopx ; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vlda wh7, [p4, #352]; vshift.align x0, x0, s0, x8, r3; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vlda wl7, [p4, #320] + ; CHECK-NEXT: vlda wh9, [p4, #416]; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh11, [p4, #480]; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vlda wl11, [p4, #448]; vshuffle x3, x4, x6, r9 + ; CHECK-NEXT: vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: and r3, r3, r0; mov p4, p7; vmac.f bmh7, bmh7, x8, x5, r29 + ; CHECK-NEXT: add r3, r3, #34; vmac.f bmh5, bmh5, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml2, bml2, x3, x5, r29 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x10, x5, r29 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: nopx + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi 
ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p4, $p5, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 + + $p7 = MOV_mv_scl $p5 + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 + $x8 = VSHUFFLE $x0, $x2, $r9 + $x3 = VSHUFFLE $x4, $x6, $r9 + $x5 = VSHUFFLE $x0, $x2, $r25 + $x10 = VSHUFFLE $x4, $x6, $r25 + $x1 = VSHUFFLE $x3, $x5, $r13 + $x3 = VSHUFFLE $x3, 
$x5, $r24 + $wh5 = VLD_idx_imm_3x32_pseudo $p5, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p5 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p5, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl7 = VLDA_dmw_lda_w_ag_idx_imm $p4, 320 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 416 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl9 = VLDA_dmw_lda_w_ag_idx_imm $p4, 384 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 480 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11 = VLDA_dmw_lda_w_ag_idx_imm $p4, 448 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $r3 = MOV_mv_scl $p0 + $r3 = AND $r3, $r0 + $r3 = nuw nsw ADD_add_r_ri $r3, 34, implicit-def $srcarry + $p4 = MOV_mv_scl $p7 + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir new file mode 100644 index 000000000000..85064a43cb53 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir @@ -0,0 +1,233 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from conv2d_bf16_0 + +--- | + define dso_local void @conv2d(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: conv2d: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wh7, [p7, #32]; mov p4, p2 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; nopx ; mov p5, p7 + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl8, [p0], m4 + ; CHECK-NEXT: vldb wh10, [p0, #32] + ; CHECK-NEXT: vldb wl10, [p0], m4 + ; CHECK-NEXT: vldb wh1, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0], m4; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r1, p0; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; and r2, r1, r0; vshift.align x0, x0, s0, x8, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv + ; CHECK-NEXT: nopb ; vlda wh5, [p2, #352]; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv + ; CHECK-NEXT: vldb wl5, [p4], #64; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv + ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 + ; CHECK-NEXT: mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wh7, [p7, #32]; nopa ; nops ; nopx ; mov p4, p2; vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; nopx ; mov p5, p7; vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: mov r1, p0; vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: and r2, r1, r0; vshift.align x0, x0, s0, x8, r3; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r3; vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vldb wl5, [p4], #64; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 + ; 
CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vldb wh11, [p4, #32]; vshuffle x1, x3, x5, r13 + ; CHECK-NEXT: vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r29 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p2, p5; vmac.f bmh5, bmh5, x1, x7, r29 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x10, x11, r29 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x5, r29 + ; CHECK-NEXT: vmac.f bml6, bml6, x3, x5, r29 + ; CHECK-NEXT: vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: vmac.f bml1, bml1, x3, x11, r29 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: conv2d +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc3, $dc4, $dc5, $dc7, $dj1, $dj3, $dj5, $dj7, $dn1, $dn5, $dn7, $m0, $m1, $m3, $m4, $m5, $m7, $p0, $p1, $p2, $p3, $p6, $p7, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r13, $r14, $r15, $r16, $r17, $r18, $r19, $r20, $r21, $r22, $r23, $r24, $r25, $r26, $r27, $r28, $r29, $r30, $r31, $s0, $s1, $x0, $x2, $x4, $x6, $d1_3d:0x000000000003C870 + + $p5 = MOV_mv_scl $p7 + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl8, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh10 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl10, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl1, $p0 = VLD_pstm_pseudo $p0, $m4 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh3 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl3, $p0, $dc1, $dc5 = VLD_3D_pseudo $p0, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x0 = VSHIFT_ALIGN $x0, $s0, $x8, $r3 + $x2 = VSHIFT_ALIGN $x2, $s0, $x10, $r3 + $x4 = VSHIFT_ALIGN $x4, $s0, $x1, $r3 + $x6 = VSHIFT_ALIGN $x6, $s0, $x3, $r3 + $x8 = VSHUFFLE $x0, $x2, $r9 + $x3 = VSHUFFLE $x4, $x6, $r9 + $x5 = VSHUFFLE $x0, $x2, $r25 + $x10 = VSHUFFLE $x4, $x6, $r25 + $x1 = VSHUFFLE $x3, $x5, $r13 + $x3 = VSHUFFLE $x3, $x5, $r24 + $wh7 = VLD_idx_imm_3x32_pseudo $p7, 32 :: (load (<16 x s16>) from %ir.d.addr.07 + 32, addrspace 5) + $wl7, $p7 = VLDA_dmw_lda_w_ag_pstm_nrm_imm $p7, 256 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $p4 = MOV_mv_scl $p2 + $p4 = nuw PADD_imm9_pseudo $p4, 320 + $wh5 = VLDA_dmw_lda_w_ag_idx_imm $p2, 352 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p4 = VLD_pstm_imm_4x32_pseudo $p4, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh9 = VLD_idx_imm_3x32_pseudo $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl9, $p4 = VLD_pstm_imm_4x32_pseudo $p4, 64 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11 = VLD_idx_imm_3x32_pseudo $p4, 32 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl11 = VLD_idx_imm_3x32_pseudo $p4, 0 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x8, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x1, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x3, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x10, $x7, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x9, 
$r29, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x3, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x9, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x8, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x3, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x10, $x5, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x8, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x3, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x10, $x11, $r29, implicit-def $srfpflags, implicit $crfpmask + $r1 = MOV_mv_scl $p0 + $r2 = AND $r1, $r0 + $r3 = nuw nsw ADD_add_r_ri $r2, 34, implicit-def $srcarry + $p2 = MOV_mv_scl $p5 + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir index 655cdee89a7a..a5dae2d34a2a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir @@ -29,13 +29,13 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopxm ; nopv - ; CHECK-NEXT: nopx + ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv + ; CHECK-NEXT: nopa ; nopx + ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop - ; CHECK-NEXT: mov p2, p1 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r0, [p0, #0]; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir new file mode 100644 index 000000000000..d2e5db47a8e9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: // 
%bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
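+# The expected schedule above pipelines the loop in two stages: %bb.1 issues
+# the loads and shuffles of the first iteration, the .LBB0_2 body overlaps the
+# loads of iteration i+1 with the vmac.f chain of iteration i, and %bb.3
+# completes the final iteration's multiply-accumulates.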
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x8 = VSHUFFLE $x8, $x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x0 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def 
$srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir new file mode 100644 index 000000000000..0cd59c1838d1 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x8, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; 
CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x0, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x8, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x0, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x0, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + 
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x0 = VSHUFFLE $x8, 
$x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x10 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir new file mode 100644 index 000000000000..7be844a699a2 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x2, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x2, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x2, x2, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: // 
%bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x2, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
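+# Same kernel as gemm-1.mir, except that the cross-iteration vshuffle results
+# are assigned to different registers (x2 here, versus x8 in gemm-1 and x0 in
+# gemm-2), presumably to probe the post-pipeliner against a different
+# anti-dependence pattern.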
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x2 = VSHUFFLE $x2, $x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x0 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def 
$srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x2, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir new file mode 100644 index 000000000000..d27b6a59be9c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir @@ -0,0 +1,213 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner,postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl8, [p1], m5; nopx + ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb wl9, [p1], m5 + ; CHECK-NEXT: vldb wl0, [p0, #0] + ; CHECK-NEXT: vldb wh0, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0, #64] + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; add.nc lc, r0, #-1 + ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb wh2, [p0], #32; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; nopv + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 
+ ; CHECK-NEXT: vldb wl8, [p1], m5; vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: vldb wh8, [p1], m6; nopx ; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vshuffle x8, x8, x8, r6; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; nopv + ; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label 
%for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl0 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADDS_st_ptr_inc_idx $p0, $m4 + $p0 = nuw PADD_imm9_pseudo $p0, 128 + $wl2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x4 = VSHUFFLE $x0, $x2, $r3 + $x5 = VSHUFFLE $x0, $x2, $r16 + $x6 = VSHUFFLE $x1, $x3, $r3 + $x7 = VSHUFFLE $x1, $x3, $r16 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl10, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh11, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d 
:: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x8 = VSHUFFLE $x8, $x8, $r6 + $x9 = VSHUFFLE $x9, $x9, $r6 + $x10 = VSHUFFLE $x10, $x10, $r6 + $x11 = VSHUFFLE $x11, $x11, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir new file mode 100644 index 000000000000..55b804bc9868 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir @@ -0,0 +1,240 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner,postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p0], #32; nopx + ; CHECK-NEXT: vldb wl1, [p0], #32 + ; CHECK-NEXT: vldb wh1, [p0], #32 + ; CHECK-NEXT: vldb wl8, [p1], m5; padds [p0], m4 + ; CHECK-NEXT: vldb wl2, [p0], #32 + ; CHECK-NEXT: vldb wh2, [p0], #32 + ; CHECK-NEXT: vldb wh8, [p1], m6 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb wl9, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl10, [p1], m5 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16 + ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; add.nc lc, r0, #-2; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; movxm ls, #.LBB0_2; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm le, #.L_LEnd0; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, 
x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; nopb ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void 
@llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADDS_st_ptr_inc_idx $p0, $m4 + $wl2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh2, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x4 = VSHUFFLE $x0, $x2, $r3 + $x5 = VSHUFFLE $x0, $x2, $r16 + $x6 = VSHUFFLE $x1, $x3, $r3 + $x7 = VSHUFFLE $x1, $x3, $r16 + $wl8, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl10, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh11, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x8 = VSHUFFLE $x8, $x8, $r6 + $x9 = VSHUFFLE $x9, $x9, $r6 + $x10 = VSHUFFLE $x10, $x10, $r6 + $x11 = 
VSHUFFLE $x11, $x11, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x8, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x9, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x10, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bmh0, $x4, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bmh1, $x6, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bmh2, $x5, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bmh3, $x7, $x11, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir new file mode 100644 index 000000000000..7653de7caab9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir @@ -0,0 +1,243 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wl11, [p1], m5; nopxm ; nops + ; CHECK-NEXT: vldb wh11, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vlda wh5, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0], #32 + ; CHECK-NEXT: vldb wh8, [p0], #32 + ; CHECK-NEXT: vldb wl1, [p0], #32 + ; CHECK-NEXT: vldb wh1, [p0], #32 + ; CHECK-NEXT: paddb [p0], m4 + ; CHECK-NEXT: vldb wl0, [p0], #32 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wl0, [p1], m5 + ; CHECK-NEXT: vldb wh0, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x2, x8, x0, r16 + ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vlda wh5, [p1], m6; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0], #32; vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; add.nc lc, r0, #-2; vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: paddb [p0], m4; movxm ls, #.LBB0_2; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; movxm le, #.L_LEnd0; vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; vldb wl11, [p1], m5; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vlda wh5, [p1], m6; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0], #32; vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl1, 
[p0], #32; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x5, x5, x5, r2; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wh0, [p1], m6; vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: vshuffle x6, x8, x0, r4 + ; CHECK-NEXT: vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 + ; CHECK-NEXT: vshuffle x3, x11, x11, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vshuffle x0, x0, x0, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x9, x7, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %p5 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %p6 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %p6, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %p5, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p5, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %p6, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind 
willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $x6 = VSHUFFLE $x8, $x0, $r4 + $x2 = VSHUFFLE $x8, $x0, $r16 + $x10 = VSHUFFLE $x1, $x3, $r4 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh11, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.p6, addrspace 6) + $x3 = VSHUFFLE $x11, $x11, $r2 + $x5 = VSHUFFLE $x5, $x5, $r2 + $x0 = VSHUFFLE $x0, $x0, $r2 + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense 
$bmh1, $x2, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x2, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x9, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir new file mode 100644 index 000000000000..c42c77b6337b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir @@ -0,0 +1,215 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl1, [p0, #64] + ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4 + ; CHECK-NEXT: paddb [p0], #128 + ; CHECK-NEXT: vldb wl10, [p0], #32 + ; CHECK-NEXT: vldb wh10, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl3, [p1], m5; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wh3, [p1], m6; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopx ; vshuffle x6, x8, x10, r3; nopv + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopx ; vshuffle x11, x8, x10, r16; nopv + ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopx ; vshuffle x0, x1, x3, r3; nopv + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x1, x3, r16; nopv + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vshuffle x3, x3, x3, r6 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx + ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4; vmac.f bmh1, bmh1, x11, x3, r2 + ; CHECK-NEXT: paddb [p0], #128; vshuffle x0, x0, x0, r6; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh5, bmh5, x11, x5, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh6, bmh6, x0, x5, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vldb wl3, [p1], m5; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vldb wh3, [p1], m6; vmac.f bml0, bml0, x11, x0, r2 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml1, bml1, x0, x0, r2 + ; CHECK-NEXT: vldb wh5, [p1], m6; vshuffle x6, x8, x10, r3; vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vldb wl0, [p1], m5; vshuffle x11, x8, x10, r16; vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x0, x1, x3, r3; vmac.f bml4, bml4, x11, x7, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x9, x1, x3, r16; vmac.f bml6, bml6, x0, x7, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r6; nopv + ; CHECK-NEXT: nopx + ; CHECK-NEXT: vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x11, x3, r2 
+ ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x11, x5, r2 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x0, x5, r2 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vmac.f bml0, bml0, x11, x0, r2 + ; CHECK-NEXT: vmac.f bml1, bml1, x0, x0, r2 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vmac.f bml4, bml4, x11, x7, r2 + ; CHECK-NEXT: vmac.f bml6, bml6, x0, x7, r2 + ; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x6 = VSHUFFLE $x8, $x10, $r3 + $x11 = VSHUFFLE $x8, $x10, $r16 + $x0 = VSHUFFLE $x1, $x3, $r3 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x3 = VSHUFFLE $x3, $x3, $r6 + $x5 = VSHUFFLE $x5, $x5, $r6 + $x7 = VSHUFFLE $x7, $x7, $r6 + $x0 = VSHUFFLE $x0, $x0, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x11, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x0, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x11, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x0, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = 
VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x11, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x0, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x11, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x0, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir new file mode 100644 index 000000000000..dd2f9173bb7e --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir @@ -0,0 +1,208 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ +# RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vlda wl11, [p1], m5; vldb wl8, [p0], #32; nopxm + ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32 + ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32 + ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32 + ; CHECK-NEXT: paddb [p0], m4 + ; CHECK-NEXT: vldb wl0, [p0], #32 + ; CHECK-NEXT: vldb wh0, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r6; nopv + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x8, x0, r3; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x8, x0, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopx ; 
vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 + ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32; nopx ; vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 + ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r2 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r2 + ; CHECK-NEXT: vldb wl0, [p1], m5; vmac.f bml1, bml1, x10, x0, r2 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x5, x5, x5, r6; vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; vmac.f bml4, bml4, x2, x7, r2 + ; CHECK-NEXT: vshuffle x6, x8, x0, r3; vmac.f bml6, bml6, x10, x7, r2 + ; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 + ; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 + ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: vmac.f bmh1, bmh1, x2, x3, r2 + ; CHECK-NEXT: vmac.f bmh2, bmh2, x10, x3, r2 + ; CHECK-NEXT: vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x6, x0, r2 + ; CHECK-NEXT: vmac.f bml0, bml0, x2, x0, r2 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x0, r2 + ; CHECK-NEXT: vmac.f bml2, bml2, x9, x0, r2 + ; CHECK-NEXT: vmac.f bml3, bml3, x6, x7, r2 + ; CHECK-NEXT: vmac.f bml4, bml4, x2, x7, r2 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x7, r2 + ; CHECK-NEXT: vmac.f bml5, bml5, x9, x7, r2 + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function 
Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... +--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + $r1 = MOV_RLC_imm10_pseudo 0 + $r1 = GE $r1, $r0 + JNZ $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh1, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $wl0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh0, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x6 = VSHUFFLE $x8, $x0, $r3 + $x2 = VSHUFFLE $x8, $x0, $r16 + $x10 = VSHUFFLE $x1, $x3, $r3 + $x9 = VSHUFFLE $x1, $x3, $r16 + $wl11, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh11, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl0, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh0, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $wh7, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.d.addr.07, addrspace 5) + $x3 = VSHUFFLE $x11, $x11, $r6 + $x5 = VSHUFFLE $x5, $x5, $r6 + $x0 = VSHUFFLE $x0, $x0, $r6 + $x7 = VSHUFFLE $x7, $x7, $r6 + $bmh0 = VMAC_F_vmac_bm_core_dense 
$bmh0, $x6, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x2, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x9, $x3, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x6, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x2, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x9, $x5, $r2, implicit-def $srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x6, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x2, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x9, $x0, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x6, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x2, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x9, $x7, $r2, implicit-def $srfpflags, implicit $crfpmask + + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir new file mode 100644 index 000000000000..faa3459fca85 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir @@ -0,0 +1,225 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +# RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s -o - | FileCheck %s + + +# derived from gemm_bf16_0 + +--- | + define dso_local void @gemm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { + ; CHECK-LABEL: gemm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: mova r1, #0; nopb ; nopxm + ; CHECK-NEXT: ge r1, r1, r0 + ; CHECK-NEXT: jnz r1, #.LBB0_4 + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + ; CHECK-NEXT: // %bb.1: // %for.body.preheader + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh9, [p1], m6 + ; CHECK-NEXT: vldb wl5, [p1], m5 + ; CHECK-NEXT: vldb wh5, [p1], m6 + ; CHECK-NEXT: vldb wl7, [p1], m5 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl8, [p0, #0] + ; CHECK-NEXT: vldb wh8, [p0, #32] + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 + ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2; vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: 
// %bb.3: // %for.cond.cleanup + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 + ; CHECK-NEXT: nopa ; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vmac.f bml0, bml0, x8, x7, r3 + ; CHECK-NEXT: vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vmac.f bml3, bml3, x1, x3, r3 + ; CHECK-NEXT: vmac.f bml5, bml5, x8, x3, r3 + ; CHECK-NEXT: vmac.f bml6, bml6, x10, x3, r3 + ; CHECK-NEXT: vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup + ; CHECK-NEXT: nopa ; ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, %for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4, !tbaa !2 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4, !tbaa !2 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !6 + } + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i32(i32) #1 + + attributes #0 = { nofree norecurse nosync nounwind memory(readwrite, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + attributes #1 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0} + !llvm.ident = !{!1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{!"clang version 18.0.0git (git@github.com:Xilinx/llvm-aie.git 6532135c22419e573eaa75f1cc5defff7e04f931)"} + !2 = !{!3, !3, i64 0} + !3 = !{!"int", !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.itercount.range", i64 10} + +... 
+--- +name: gemm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.1(0x50000000), %bb.3(0x30000000) + liveins: $p0, $p1, $r0 + + renamable $r1 = MOV_RLC_imm10_pseudo 0 + renamable $r1 = GE killed renamable $r1, renamable $r0 + JNZ killed renamable $r1, %bb.3 + DelayedSchedBarrier + + bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + liveins: $p0, $p1, $r0 + + $lc = ADD_NC $r0, 0 + $ls = MOVXM_lng_cg %bb.2 + $le = MOVXM_lng_cg + + bb.2.for.body (align 16): + liveins: $bmh0, $bmh1, $bmh2, $bmh3, $bmh4, $bmh5, $bmh6, $bmh7, $bmh8, $bml0, $bml1, $bml2, $bml3, $bml4, $bml5, $bml6, $dc0, $dc1, $dc2, $dc3, $dc4, $dc5, $dj0, $dj1, $dj2, $dj3, $dj4, $dj5, $dn0, $dn1, $dn2, $dn3, $dn4, $dn5, $m0, $m1, $m4, $m5, $m6, $m7, $p0, $p1, $p2, $p3, $p4, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12, $r16, $x0, $x2, $x4, $d0_3d, $d1_3d, $r13 + + $wl8 = VLD_idx_imm_3x32_pseudo $p0, 0 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh8 = VLD_idx_imm_3x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl6 = VLD_idx_imm_3x32_pseudo $p0, 64 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh6 = VLD_idx_imm_3x32_pseudo $p0, 96 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $p0 = nuw PADD_mod_pseudo $p0, $m4 + $p0 = PADD_imm9_pseudo $p0, 128 + $wl10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh10, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl9, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh9, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl3, $p0 = VLD_pstm_imm_4x32_pseudo $p0, 32 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh3, $p0, $dc0, $dc4 = VLD_3D_pseudo $p0, $d0_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wl5, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $wh5, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x1 = VSHUFFLE $x8, $x10, $r4 + $wl7, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x11 = VSHUFFLE $x9, $x9, $r2 + $x8 = VSHUFFLE $x8, $x10, $r16 + $wh7, $p1 = VLD_pstm_pseudo $p1, $m6 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x10 = VSHUFFLE $x6, $x3, $r4 + $bmh0 = VMAC_F_vmac_bm_core_dense $bmh0, $x1, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $x6 = VSHUFFLE $x6, $x3, $r16 + $bmh1 = VMAC_F_vmac_bm_core_dense $bmh1, $x8, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wl3, $p1 = VLD_pstm_pseudo $p1, $m5 :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $x5 = VSHUFFLE $x5, $x5, $r2 + $bmh2 = VMAC_F_vmac_bm_core_dense $bmh2, $x10, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh3 = VMAC_F_vmac_bm_core_dense $bmh3, $x6, $x11, $r3, implicit-def $srfpflags, implicit $crfpmask + $wh3, $p1, $dc1, $dc5 = VLD_3D_pseudo $p1, $d1_3d :: (load (<16 x s16>) from %ir.s.addr.06, addrspace 6) + $bmh4 = VMAC_F_vmac_bm_core_dense $bmh4, $x1, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh5 = VMAC_F_vmac_bm_core_dense $bmh5, $x8, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $x7 = VSHUFFLE $x7, $x7, $r2 + $bmh6 = VMAC_F_vmac_bm_core_dense $bmh6, $x10, $x5, $r3, implicit-def $srfpflags, implicit $crfpmask + $bmh7 = VMAC_F_vmac_bm_core_dense $bmh7, $x6, $x5, $r3, implicit-def 
$srfpflags, implicit $crfpmask + $bmh8 = VMAC_F_vmac_bm_core_dense $bmh8, $x1, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml0 = VMAC_F_vmac_bm_core_dense $bml0, $x8, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $x3 = VSHUFFLE $x3, $x3, $r2 + $bml1 = VMAC_F_vmac_bm_core_dense $bml1, $x10, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml2 = VMAC_F_vmac_bm_core_dense $bml2, $x6, $x7, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml3 = VMAC_F_vmac_bm_core_dense $bml3, $x1, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml5 = VMAC_F_vmac_bm_core_dense $bml5, $x8, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml6 = VMAC_F_vmac_bm_core_dense $bml6, $x10, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + $bml4 = VMAC_F_vmac_bm_core_dense $bml4, $x6, $x3, $r3, implicit-def $srfpflags, implicit $crfpmask + PseudoLoopEnd , %bb.2 + + bb.3.for.cond.cleanup (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index 789ef09bdcaa..2cee43297f55 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -34,35 +34,43 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; nopb ; nopx ; add.nc lc, r0, #-1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopxm + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 + ; CHECK-NEXT: nop + ; CHECK-NEXT: add.nc lc, r0, #-4 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopb ; vsrs.s8.s32 wh0, cm0, s1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1 - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 - ; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1 - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1 ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; vst.srs.s8.s32 cm3, s0, [p1], #32; nopxm ; nopv + ; 
CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopx - ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1 - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; nopb ; nopx + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1; vups.s32.s8 cm3, wh2, s1 ; CHECK-NEXT: nop - ; CHECK-NEXT: vups.s32.s8 cm2, wh0, s1 + ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: vst.srs.s8.s32 cm3, s0, [p1], #32; vups.s32.s8 cm2, wh0, s1 ; CHECK-NEXT: vups.s32.s8 cm3, wh2, s1 ; CHECK-NEXT: nop ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir b/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir index ff6d666fbd80..258452803954 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/status_regs/srWAW.mir @@ -4,8 +4,9 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -# RUN: llc -mtriple=aie2 --run-pass=postmisched --issue-limit=1 -debug-only=machine-scheduler %s -o - 2>%t.log -# RUN: cat %t.log | FileCheck %s --check-prefix=CHECK-WAW +# RUN: llc -mtriple=aie2 --run-pass=postmisched --issue-limit=1 \ +# RUN: -debug-only=machine-scheduler --aie-pipeliner-waw-sticky-registers=0 \ +# RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-WAW # REQUIRES: asserts # This test checks the write-after-write(WAW) dependencies