[Sink] Allow sinking of loads to distant blocks #135986
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms

Author: None (LU-JOHN)

Changes: Allow more loads to be sunk by ensuring that no conflicting stores are on any path to the target block. (An illustrative sketch follows the truncated diff below.)

Patch is 30.53 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/135986.diff

8 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 1a48a59c4189e..57ce0c8990f4a 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -27,43 +27,71 @@ using namespace llvm;
STATISTIC(NumSunk, "Number of instructions sunk");
STATISTIC(NumSinkIter, "Number of sinking iterations");
-static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &Stores) {
-
- if (Inst->mayWriteToMemory()) {
- Stores.insert(Inst);
- return false;
- }
-
+static bool hasStoreConflict(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
MemoryLocation Loc = MemoryLocation::get(L);
for (Instruction *S : Stores)
if (isModSet(AA.getModRefInfo(S, Loc)))
- return false;
+ return true;
+ } else if (auto *Call = dyn_cast<CallBase>(Inst)) {
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Call)))
+ return true;
}
+ return false;
+}
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
Inst->mayThrow() || !Inst->willReturn())
return false;
-
- if (auto *Call = dyn_cast<CallBase>(Inst)) {
- // Convergent operations cannot be made control-dependent on additional
- // values.
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (auto *Call = dyn_cast<CallBase>(Inst))
if (Call->isConvergent())
return false;
+ if (hasStoreConflict(Inst, AA, Stores))
+ return false;
+ return true;
+}
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Call)))
- return false;
- }
+typedef SmallPtrSet<BasicBlock *, 8> BlocksSet;
+static void findStores(SmallPtrSetImpl<Instruction *> &Stores,
+ BasicBlock *LoadBB, BasicBlock *BB,
+ BlocksSet &VisitedBlocksSet) {
+ if (BB == LoadBB || VisitedBlocksSet.contains(BB))
+ return;
+ VisitedBlocksSet.insert(BB);
+
+ for (Instruction &Inst : *BB)
+ if (Inst.mayWriteToMemory())
+ Stores.insert(&Inst);
+ for (BasicBlock *Pred : predecessors(BB))
+ findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+}
- return true;
+static bool hasConflictingStoreBeforeSuccToSinkTo(AliasAnalysis &AA,
+ Instruction *ReadMemInst,
+ BasicBlock *SuccToSinkTo) {
+ BlocksSet VisitedBlocksSet;
+ SmallPtrSet<Instruction *, 8> Stores;
+ BasicBlock *LoadBB = ReadMemInst->getParent();
+ for (BasicBlock *Pred : predecessors(SuccToSinkTo))
+ findStores(Stores, LoadBB, Pred, VisitedBlocksSet);
+ return hasStoreConflict(ReadMemInst, AA, Stores);
}
/// IsAcceptableTarget - Return true if it is possible to sink the instruction
/// in the specified basic block.
-static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
- DominatorTree &DT, LoopInfo &LI) {
+static bool IsAcceptableTarget(AliasAnalysis &AA, Instruction *Inst,
+ BasicBlock *SuccToSinkTo, DominatorTree &DT,
+ LoopInfo &LI) {
assert(Inst && "Instruction to be sunk is null");
assert(SuccToSinkTo && "Candidate sink target is null");
@@ -76,10 +104,10 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
// just punt.
// FIXME: Split critical edges if not backedges.
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
- // We cannot sink a load across a critical edge - there may be stores in
- // other code paths.
+ // Ensure that there is no conflicting store on any path to SuccToSinkTo.
if (Inst->mayReadFromMemory() &&
- !Inst->hasMetadata(LLVMContext::MD_invariant_load))
+ !Inst->hasMetadata(LLVMContext::MD_invariant_load) &&
+ hasConflictingStoreBeforeSuccToSinkTo(AA, Inst, SuccToSinkTo))
return false;
// We don't want to sink across a critical edge if we don't dominate the
@@ -153,7 +181,7 @@ static bool SinkInstruction(Instruction *Inst,
// The nearest common dominator may be in a parent loop of BB, which may not
// be beneficial. Find an ancestor.
while (SuccToSinkTo != BB &&
- !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+ !IsAcceptableTarget(AA, Inst, SuccToSinkTo, DT, LI))
SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
if (SuccToSinkTo == BB)
SuccToSinkTo = nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 074272f7bed86..28ade94040688 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1330,13 +1330,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s2, 0
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX7-NEXT: s_mov_b64 vcc, 0
; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[0:1]
@@ -1355,24 +1349,22 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX7-NEXT: s_or_b64 vcc, s[8:9], s[0:1]
; GFX7-NEXT: .LBB13_2: ; %exit
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
; GFX8-NEXT: s_mov_b64 vcc, 0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
@@ -1391,12 +1383,20 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX8-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX8-NEXT: .LBB13_2: ; %exit
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, v1, s[0:1]
+; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 8
; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -1404,12 +1404,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W32: ; %bb.0: ; %entry
-; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
-; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W32-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
+; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0
; GFX10_W32-NEXT: s_and_saveexec_b32 s1, s0
; GFX10_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W32-NEXT: ; %bb.1: ; %bb
@@ -1426,9 +1422,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX10_W32-NEXT: .LBB13_2: ; %exit
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX10_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W32-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
+; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] offset:8
@@ -1436,12 +1437,8 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W64: ; %bb.0: ; %entry
-; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
-; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT: s_mov_b64 vcc, 0
-; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[0:1]
; GFX10_W64-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX10_W64-NEXT: s_mov_b64 vcc, 0
; GFX10_W64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX10_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX10_W64-NEXT: ; %bb.1: ; %bb
@@ -1458,9 +1455,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX10_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX10_W64-NEXT: .LBB13_2: ; %exit
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
+; GFX10_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W64-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
+; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] offset:8
@@ -1468,14 +1470,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W32: ; %bb.0: ; %entry
-; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
-; GFX11_W32-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W32-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0
-; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v3
-; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W32-NEXT: ; %bb.1: ; %bb
; GFX11_W32-NEXT: s_load_b64 s[2:3], s[4:5], 0x50
@@ -1491,6 +1489,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W32-NEXT: s_or_b32 vcc_lo, s2, s0
; GFX11_W32-NEXT: .LBB13_2: ; %exit
; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
+; GFX11_W32-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11_W32-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W32-NEXT: s_waitcnt vmcnt(0)
; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2
@@ -1501,14 +1503,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
;
; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX11_W64: ; %bb.0: ; %entry
-; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
-; GFX11_W64-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W64-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11_W64-NEXT: s_mov_b64 vcc, 0
; GFX11_W64-NEXT: s_mov_b64 s[2:3], exec
-; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v3
-; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
-; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v3
+; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2
; GFX11_W64-NEXT: ; %bb.1: ; %bb
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x50
@@ -1524,6 +1522,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
; GFX11_W64-NEXT: s_or_b64 vcc, s[6:7], s[0:1]
; GFX11_W64-NEXT: .LBB13_2: ; %exit
; GFX11_W64-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x28
+; GFX11_W64-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11_W64-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX11_W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11_W64-NEXT: s_waitcnt vmcnt(0)
; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 4a6b2ebd3d203..500659ea0ca86 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -877,14 +877,11 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s53, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
-; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_read_u8 v46, v0
-; CHECK-NEXT: v_mov_b32_e32 v56, s53
+; CHECK-NEXT: v_or3_b32 v46, s5, v42, s53
+; CHECK-NEXT: v_mov_b32_e32 v47, s53
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
@@ -898,46 +895,48 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v46
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
+; CHECK-NEXT: v_mov_b32_e32 v47, s8
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, v0
+; CHECK-NEXT: v_mov_b32_e32 v46, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_mov_b32 s54, exec_lo
-; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT: v_cmpx_lt_u32_e64 v47, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_mov_b32 s55, 0
+; CHECK-NEXT: ds_read_u8 v56, v0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
-; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v47
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
@@ -958,7 +957,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v47
+; CHECK-NEXT: ds_write_b32 v0, v46
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index c5732531f5423..ec95a7ed03b95 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -73,8 +73,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
}
; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure:
-; GFX908: NumSgprs: 64
-; GFX908-GCNTRACKERS: NumSgprs: 64
+; GFX908: NumSgprs: 56
+; GFX908-GCNTRACKERS: NumSgprs: 56
; GFX908: NumVgprs: 43
; GFX908-GCNTRACKERS: NumVgprs: 39
; GFX908: Occupancy: 5
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
index a27d1217031ca..0e30b4bb5925c 100644
--- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -72,13 +72,14 @@ entry:
a:
%v2 = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %p, i32 0, i32 0, i32 1, i32 0)
+ %v3 = fadd <2 x float> %v1, %v2
%v20 = extractelement <2 x float> %v2, i32 0
%v21 = extractelement <2 x float> %v2, i32 1
%cond2 = fcmp ult float %v20, %v21
br i1 %cond2, label %b, label %c
b:
- ret <2 x float> %v2
+ ret <2 x float> %v3
c:
%v4 = fadd <2 x float> %v1, %v1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 8dfd841671730..7426ecca7301a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -41,7 +41,8 @@ bb:
%tmp20 = extractelement <4 x float> %tmp18, i32 1
%tmp21 = extractelement <4 x float> %tmp18, i32 2
%tmp22 = extractelement <4 x float> %tmp18, i32 3
- %tmp23 = bitcast float %tmp14 to i32
+ %tmp23 = fadd float %tmp14, %tmp22
+ %tmp24 = bitcast f...
[truncated]
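To illustrate what the patch enables, here is a hypothetical source-level analog. The names, control flow, and `__restrict` qualifiers are illustrative assumptions, not taken from the patch; the pass itself operates on LLVM IR.

```cpp
// Before this change, Sink would not move the load of *p toward a block
// reached over a critical edge, because stores might exist on other
// paths.  With the new predecessor walk, alias analysis can prove that
// the store to *q never clobbers *p (thanks to __restrict), so the load
// may now be sunk next to its only use.
int example(int *__restrict p, int *__restrict q, bool a, bool b) {
  int v = *p; // candidate load; its only use is in a distant block
  if (a)
    *q = 1;   // store on one intervening path; proven no-alias with p
  if (b)
    return v; // sink target: reachable along multiple paths
  return 0;
}
```

Sinking the load past the intervening store shortens its live range on the paths that never use the loaded value.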
What you're doing here looks like it will be extremely expensive even if you don't hit a degenerate case.
You're performing AA queries on all stores in a potentially large number of blocks (and I think the block walk may not even stop at the origin block if there is a way past it), and you're doing that for every single potentially sinkable load.
If you want to do something like this you probably need to make use of MemorySSA or similar.
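For reference, here is a rough sketch of the kind of MemorySSA query that could replace the ad-hoc predecessor walk. This is an assumption about one possible approach, not code from this PR, and it bails out conservatively when the sink target has no MemoryPhi:

```cpp
#include "llvm/Analysis/MemorySSA.h"
using namespace llvm;

// Sketch: returns true when MemorySSA shows that no store on any path
// between the load and SuccToSinkTo clobbers the loaded location.
static bool noClobberOnPathsTo(LoadInst *L, BasicBlock *SuccToSinkTo,
                               MemorySSA &MSSA) {
  MemorySSAWalker *Walker = MSSA.getWalker();
  // The nearest access that clobbers the load at its current position.
  MemoryAccess *DefAtLoad =
      Walker->getClobberingMemoryAccess(MSSA.getMemoryAccess(L));
  // Memory state on entry to the sink target.  Without a MemoryPhi the
  // sketch gives up rather than walking the dominator tree.
  MemoryPhi *EntryPhi = MSSA.getMemoryAccess(SuccToSinkTo);
  if (!EntryPhi)
    return false;
  // If the nearest clobber of the load's location, queried from the sink
  // point, is still the same access, nothing in between modifies it.
  MemoryAccess *ClobberAtTarget =
      Walker->getClobberingMemoryAccess(EntryPhi, MemoryLocation::get(L));
  return ClobberAtTarget == DefAtLoad;
}
```

A cached walker makes repeated queries cheap, which speaks to the cost concern for functions with many sinkable loads.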
The block walk must stop in the load block since the load block dominates the uses. I'll add a limit to the number of stores that are analyzed. Is a default of 6 okay?
Probably should avoid doing ad-hoc memory analysis, and use one of the existing analyses.
static cl::opt<unsigned> SinkLoadStoreLimit(
    "sink-load-store-limit", cl::Hidden, cl::init(4),
    cl::desc("Maximum number of stores in descendant blocks that will be "
             "analyzed when attempting to sink a load."));
Can you avoid this super specific flag? Is there some threshold MemorySSA usually uses? At worst can it be a pass parameter instead?
I see a MaxCheckLimit in MemorySSA.cpp, but it is local to MemorySSA.cpp. EarlyCSE.cpp, DeadStoreElimination.cpp, and LICM.cpp all have hidden options setting limits on memory SSA usage. Is it okay if we stick with a hidden option? Otherwise, by "pass parameter" do you mean a limit specific to Sink that is not implemented as a user option?
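For context, one plausible way such a cap could gate the walk, sketched under the assumption that the budget is enforced while collecting stores. This is not necessarily how the PR wires it up; `findStores` here mirrors the version in the diff, with `SinkLoadStoreLimit` being the hidden cl::opt quoted above:

```cpp
// Sketch: stop collecting once the store budget is exhausted and report
// failure, so the caller conservatively refuses to sink the load.
static bool findStores(SmallPtrSetImpl<Instruction *> &Stores,
                       BasicBlock *LoadBB, BasicBlock *BB,
                       BlocksSet &VisitedBlocksSet) {
  if (BB == LoadBB || !VisitedBlocksSet.insert(BB).second)
    return true; // stop at the load's block or an already-visited block
  for (Instruction &Inst : *BB)
    if (Inst.mayWriteToMemory()) {
      if (Stores.size() >= SinkLoadStoreLimit)
        return false; // budget exceeded
      Stores.insert(&Inst);
    }
  for (BasicBlock *Pred : predecessors(BB))
    if (!findStores(Stores, LoadBB, Pred, VisitedBlocksSet))
      return false;
  return true;
}
```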