[Sink] Allow sinking of loads to distant blocks #135986
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms

Author: None (LU-JOHN)

Changes: Allow more loads to be sunk by ensuring that no conflicting stores are on any path to the target block. (An illustrative sketch follows the truncated diff below.)

Patch is 30.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/135986.diff

8 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 1a48a59c4189e..57ce0c8990f4a 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -27,43 +27,71 @@ using namespace llvm;
STATISTIC(NumSunk, "Number of instructions sunk");
STATISTIC(NumSinkIter, "Number of sinking iterations");
-static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &Stores) {
-
- if (Inst->mayWriteToMemory()) {
- Stores.insert(Inst);
- return false;
- }
-
+static bool hasStoreConflict(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
MemoryLocation Loc = MemoryLocation::get(L);
for (Instruction *S : Stores)
if (isModSet(AA.getModRefInfo(S, Loc)))
- return false;
+ return true;
+ } else if (auto *Call = dyn_cast<CallBase>(Inst)) {
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Call)))
+ return true;
}
+ return false;
+}
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
Inst->mayThrow() || !Inst->willReturn())
return false;
-
- if (auto *Call = dyn_cast<CallBase>(Inst)) {
- // Convergent operations cannot be made control-dependent on additional
- // values.
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (auto *Call = dyn_cast<CallBase>(Inst))
if (Call->isConvergent())
return false;
+ if (hasStoreConflict(Inst, AA, Stores))
+ return false;
+ return true;
+}
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Call)))
- return false;
- }
+typedef SmallPtrSet<BasicBlock *, 8> BlocksSet;
+static void findStores(SmallPtrSetImpl<Instruction *> &Stores,
+ BasicBlock *LoadBB, BasicBlock *BB,
+ BlocksSet &VisitedBlocksSet) {
+ if (BB == LoadBB || VisitedBlocksSet.contains(BB))
+ return;
+ VisitedBlocksSet.insert(BB);
+
+ for (Instruction &Inst : *BB)
+ if (Inst.mayWriteToMemory())
+ Stores.insert(&Inst);
+ for (BasicBlock *Pred : predecessors(BB))
+ findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+}
- return true;
+static bool hasConflictingStoreBeforeSuccToSinkTo(AliasAnalysis &AA,
+ Instruction *ReadMemInst,
+ BasicBlock *SuccToSinkTo) {
+ BlocksSet VisitedBlocksSet;
+ SmallPtrSet<Instruction *, 8> Stores;
+ BasicBlock *LoadBB = ReadMemInst->getParent();
+ for (BasicBlock *Pred : predecessors(SuccToSinkTo))
+ findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+ return hasStoreConflict(ReadMemInst, AA, Stores);
}
/// IsAcceptableTarget - Return true if it is possible to sink the instruction
/// in the specified basic block.
-static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
- DominatorTree &DT, LoopInfo &LI) {
+static bool IsAcceptableTarget(AliasAnalysis &AA, Instruction *Inst,
+ BasicBlock *SuccToSinkTo, DominatorTree &DT,
+ LoopInfo &LI) {
assert(Inst && "Instruction to be sunk is null");
assert(SuccToSinkTo && "Candidate sink target is null");
@@ -76,10 +104,10 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
// just punt.
// FIXME: Split critical edges if not backedges.
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
- // We cannot sink a load across a critical edge - there may be stores in
- // other code paths.
+ // Ensure that there is no conflicting store on any path to SuccToSinkTo.
if (Inst->mayReadFromMemory() &&
- !Inst->hasMetadata(LLVMContext::MD_invariant_load))
+ !Inst->hasMetadata(LLVMContext::MD_invariant_load) &&
+ hasConflictingStoreBeforeSuccToSinkTo(AA, Inst, SuccToSinkTo))
return false;
// We don't want to sink across a critical edge if we don't dominate the
@@ -153,7 +181,7 @@ static bool SinkInstruction(Instruction *Inst,
// The nearest common dominator may be in a parent loop of BB, which may not
// be beneficial. Find an ancestor.
while (SuccToSinkTo != BB &&
- !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+ !IsAcceptableTarget(AA, Inst, SuccToSinkTo, DT, LI))
SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
if (SuccToSinkTo == BB)
SuccToSinkTo = nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 074272f7bed86..28ade94040688 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1330,13 +1330,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s2, 0
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX7-NEXT: s_mov_b64 vcc, 0
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
@@ -1355,24 +1349,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1]
; GFX7-NEXT: .LBB13_2: ; %exit
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX8-NEXT: s_mov_b64 vcc, 0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
@@ -1391,12 +1383,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX8-NEXT: .LBB13_2: ; %exit
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1]
+; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 8
; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1404,12 +1404,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W32: ; %bb.0: ; %entry
-; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
-; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
+; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0
; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W32-NEXT: ; %bb.1: ; %bb
@@ -1426,9 +1422,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX10_W32-NEXT: .LBB13_2: ; %exit
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX10_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W32-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
+; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8
@@ -1436,12 +1437,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W64: ; %bb.0: ; %entry
-; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT: s_mov_b64 vcc, 0
-; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W64-NEXT: ; %bb.1: ; %bb
@@ -1458,9 +1455,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX10_W64-NEXT: .LBB13_2: ; %exit
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX10_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W64-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
+; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8
@@ -1468,14 +1470,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W32: ; %bb.0: ; %entry
-; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
-; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3
-; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W32-NEXT: ; %bb.1: ; %bb
; GFX11_W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x50
@@ -1491,6 +1489,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX11_W32-NEXT: .LBB13_2: ; %exit
; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
+; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W32-NEXT: s_waitcnt vmcnt(0)
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -1501,14 +1503,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W64: ; %bb.0: ; %entry
-; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
-; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W64-NEXT: s_mov_b64 vcc, 0
; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec
-; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3
-; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
-; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W64-NEXT: ; %bb.1: ; %bb
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x50
@@ -1524,6 +1522,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX11_W64-NEXT: .LBB13_2: ; %exit
; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
+; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W64-NEXT: s_waitcnt vmcnt(0)
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 4a6b2ebd3d203..500659ea0ca86 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -877,14 +877,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s53, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
-; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_read_u8 v46, v0
-; CHECK-NEXT: v_mov_b32_e32 v56, s53
+; CHECK-NEXT: v_or3_b32 v46, s5, v42, s53
+; CHECK-NEXT: v_mov_b32_e32 v47, s53
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
@@ -898,46 +895,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v46
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
+; CHECK-NEXT: v_mov_b32_e32 v47, s8
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, v0
+; CHECK-NEXT: v_mov_b32_e32 v46, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_mov_b32 s54, exec_lo
-; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT: v_cmpx_lt_u32_e64 v47, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_mov_b32 s55, 0
+; CHECK-NEXT: ds_read_u8 v56, v0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
-; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v47
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
@@ -958,7 +957,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v47
+; CHECK-NEXT: ds_write_b32 v0, v46
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index c5732531f5423..ec95a7ed03b95 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
}
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
-; GFX908: NumSgprs: 64
-; GFX908-GCNTRACKERS: NumSgprs: 64
+; GFX908: NumSgprs: 56
+; GFX908-GCNTRACKERS: NumSgprs: 56
; GFX908: NumVgprs: 43
; GFX908-GCNTRACKERS: NumVgprs: 39
; GFX908: Occupancy: 5
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
index a27d1217031ca..0e30b4bb5925c 100644
--- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -72,13 +72,14 @@ entry:
a:
%v2 = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0)
+ %v3 = fadd <2 x float> %v1, %v2
%v20 = extractelement <2 x float> %v2, i32 0
%v21 = extractelement <2 x float> %v2, i32 1
%cond2 = fcmp ult float %v20, %v21
br i1 %cond2, label %b, label %c
b:
- ret <2 x float> %v2
+ ret <2 x float> %v3
c:
%v4 = fadd <2 x float> %v1, %v1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 8dfd841671730..7426ecca7301a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -41,7 +41,8 @@ bb:
%tmp20 = extractelement <4 x float> %tmp18, i32 1
%tmp21 = extractelement <4 x float> %tmp18, i32 2
%tmp22 = extractelement <4 x float> %tmp18, i32 3
- %tmp23 = bitcast float %tmp14 to i32
+ %tmp23 = fadd float %tmp14, %tmp22
+ %tmp24 = bitcast f...
[truncated]
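To illustrate what the patch enables, here is a hypothetical source-level analog. The names, control flow, and `__restrict` qualifiers are illustrative assumptions, not taken from the patch; the pass itself operates on LLVM IR.

```cpp
// Before this change, Sink would not move the load of *p toward a block
// reached over a critical edge, because stores might exist on other
// paths.  With the new predecessor walk, alias analysis can prove that
// the store to *q never clobbers *p (thanks to __restrict), so the load
// may now be sunk next to its only use.
int example(int *__restrict p, int *__restrict q, bool a, bool b) {
  int v = *p; // candidate load; its only use is in a distant block
  if (a)
    *q = 1;   // store on one intervening path; proven no-alias with p
  if (b)
    return v; // sink target: reachable along multiple paths
  return 0;
}
```

Sinking the load past the intervening store shortens its live range on the paths that never use the loaded value.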
What you're doing here looks like it will be extremely expensive even if you don't hit a degenerate case.
You're performing AA queries on all stores in a potentially large number of blocks (and I think the block walk may not even stop at the origin block if there is a way past it), and you're doing that for every single potentially sinkable load.
If you want to do something like this you probably need to make use of MemorySSA or similar.
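For reference, here is a rough sketch of the kind of MemorySSA query that could replace the ad-hoc predecessor walk. This is an assumption about one possible approach, not code from this PR, and it bails out conservatively when the sink target has no MemoryPhi:

```cpp
#include "llvm/Analysis/MemorySSA.h"
using namespace llvm;

// Sketch: returns true when MemorySSA shows that no store on any path
// between the load and SuccToSinkTo clobbers the loaded location.
static bool noClobberOnPathsTo(LoadInst *L, BasicBlock *SuccToSinkTo,
                               MemorySSA &MSSA) {
  MemorySSAWalker *Walker = MSSA.getWalker();
  // The nearest access that clobbers the load at its current position.
  MemoryAccess *DefAtLoad =
      Walker->getClobberingMemoryAccess(MSSA.getMemoryAccess(L));
  // Memory state on entry to the sink target.  Without a MemoryPhi the
  // sketch gives up rather than walking the dominator tree.
  MemoryPhi *EntryPhi = MSSA.getMemoryAccess(SuccToSinkTo);
  if (!EntryPhi)
    return false;
  // If the nearest clobber of the load's location, queried from the sink
  // point, is still the same access, nothing in between modifies it.
  MemoryAccess *ClobberAtTarget =
      Walker->getClobberingMemoryAccess(EntryPhi, MemoryLocation::get(L));
  return ClobberAtTarget == DefAtLoad;
}
```

A cached walker makes repeated queries cheap, which speaks to the cost concern for functions with many sinkable loads.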
The block walk must stop in the load block since the load block dominates the uses. I'll add a limit to the number of stores that are analyzed. Is a default of 6 okay?
Probably should avoid doing ad-hoc memory analysis, and use one of the existing analyses.
static cl::opt<unsigned> SinkLoadStoreLimit(
    "sink-load-store-limit", cl::Hidden, cl::init(4),
    cl::desc("Maximum number of stores in descendant blocks that will be "
             "analyzed when attempting to sink a load."));
Can you avoid this super specific flag? Is there some threshold MemorySSA usually uses? At worst can it be a pass parameter instead?
I see a MaxCheckLimit in MemorySSA.cpp, but it is local to MemorySSA.cpp. EarlyCSE.cpp, DeadStoreElimination.cpp, and LICM.cpp all have hidden options setting limits on memory SSA usage. Is it okay if we stick with a hidden option? Otherwise, by "pass parameter" do you mean a limit specific to Sink that is not implemented as a user option?
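For context, one plausible way such a cap could gate the walk, sketched under the assumption that the budget is enforced while collecting stores. This is not necessarily how the PR wires it up; `findStores` here mirrors the version in the diff, with `SinkLoadStoreLimit` being the hidden cl::opt quoted above:

```cpp
// Sketch: stop collecting once the store budget is exhausted and report
// failure, so the caller conservatively refuses to sink the load.
static bool findStores(SmallPtrSetImpl<Instruction *> &Stores,
                       BasicBlock *LoadBB, BasicBlock *BB,
                       BlocksSet &VisitedBlocksSet) {
  if (BB == LoadBB || !VisitedBlocksSet.insert(BB).second)
    return true; // stop at the load's block or an already-visited block
  for (Instruction &Inst : *BB)
    if (Inst.mayWriteToMemory()) {
      if (Stores.size() >= SinkLoadStoreLimit)
        return false; // budget exceeded
      Stores.insert(&Inst);
    }
  for (BasicBlock *Pred : predecessors(BB))
    if (!findStores(Stores, LoadBB, Pred, VisitedBlocksSet))
      return false;
  return true;
}
```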