diff --git a/lgc/include/lgc/patch/PatchBufferOp.h b/lgc/include/lgc/patch/PatchBufferOp.h
index 3f9345425c..96ccfe4ffd 100644
--- a/lgc/include/lgc/patch/PatchBufferOp.h
+++ b/lgc/include/lgc/patch/PatchBufferOp.h
@@ -118,6 +118,8 @@ class BufferOpLowering {
   llvm::Value *replaceLoadStore(llvm::Instruction &inst);
   llvm::Instruction *makeLoop(llvm::Value *const loopStart, llvm::Value *const loopEnd, llvm::Value *const loopStride,
                               llvm::Instruction *const insertPos);
+  llvm::Value *createGlobalPointerAccess(llvm::Value *const bufferDesc, llvm::Value *const offset, llvm::Type *const type,
+                                         llvm::Instruction &inst, const llvm::function_ref<llvm::Value *(llvm::Value *)> callback);
 
   TypeLowering &m_typeLowering;
   llvm::IRBuilder<> m_builder;
diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h
index 6a346ac856..5d8a3036ea 100644
--- a/lgc/interface/lgc/Pipeline.h
+++ b/lgc/interface/lgc/Pipeline.h
@@ -187,6 +187,7 @@ union Options {
     unsigned rtStaticPipelineFlags; // Ray tracing static pipeline flags
     unsigned rtTriCompressMode;     // Ray tracing triangle compression mode
    bool useGpurt;                  // Whether GPURT is used
+    bool enableExtendedRobustBufferAccess; // Enable extended robust buffer access
   };
 };
 static_assert(sizeof(Options) == sizeof(Options::u32All));
diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp
index c728e9a9ad..3d89ea6ada 100644
--- a/lgc/patch/PatchBufferOp.cpp
+++ b/lgc/patch/PatchBufferOp.cpp
@@ -353,34 +353,27 @@ void BufferOpLowering::visitAtomicCmpXchgInst(AtomicCmpXchgInst &atomicCmpXchgIn
 
   // If our buffer descriptor is divergent, need to handle it differently.
   if (getDescriptorInfo(bufferDesc).divergent.value()) {
-    Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc);
-
-    // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access.
-    Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2);
-    Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound);
-    Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0));
-
-    // Add on the index to the address.
-    Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex);
-
-    atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL));
-
-    const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering();
-    const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering();
-
-    Value *const compareValue = atomicCmpXchgInst.getCompareOperand();
-    Value *const newValue = atomicCmpXchgInst.getNewValOperand();
-    AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg(
-        atomicPointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering);
-    newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile());
-    newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID());
-    newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak());
-    copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst);
+    auto createAtomicCmpXchgFunc = [&](Value *pointer) {
+      const AtomicOrdering successOrdering = atomicCmpXchgInst.getSuccessOrdering();
+      const AtomicOrdering failureOrdering = atomicCmpXchgInst.getFailureOrdering();
+
+      Value *const compareValue = atomicCmpXchgInst.getCompareOperand();
+      Value *const newValue = atomicCmpXchgInst.getNewValOperand();
+      AtomicCmpXchgInst *const newAtomicCmpXchg = m_builder.CreateAtomicCmpXchg(
+          pointer, compareValue, newValue, MaybeAlign(), successOrdering, failureOrdering);
+      newAtomicCmpXchg->setVolatile(atomicCmpXchgInst.isVolatile());
+      newAtomicCmpXchg->setSyncScopeID(atomicCmpXchgInst.getSyncScopeID());
+      newAtomicCmpXchg->setWeak(atomicCmpXchgInst.isWeak());
+      copyMetadata(newAtomicCmpXchg, &atomicCmpXchgInst);
+      return newAtomicCmpXchg;
+    };
+    Value *result =
+        createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicCmpXchgInst, createAtomicCmpXchgFunc);
 
     // Record the atomic instruction so we remember to delete it later.
     m_typeLowering.eraseInstruction(&atomicCmpXchgInst);
 
-    atomicCmpXchgInst.replaceAllUsesWith(newAtomicCmpXchg);
+    atomicCmpXchgInst.replaceAllUsesWith(result);
   } else {
     switch (atomicCmpXchgInst.getSuccessOrdering()) {
     case AtomicOrdering::Release:
@@ -459,29 +452,21 @@ void BufferOpLowering::visitAtomicRMWInst(AtomicRMWInst &atomicRmwInst) {
 
   // If our buffer descriptor is divergent, need to handle it differently.
   if (getDescriptorInfo(bufferDesc).divergent.value()) {
-    Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc);
-
-    // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access.
-    Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2);
-    Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound);
-    Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0));
-
-    // Add on the index to the address.
-    Value *atomicPointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex);
-
-    atomicPointer = m_builder.CreateBitCast(atomicPointer, storeType->getPointerTo(ADDR_SPACE_GLOBAL));
-
-    AtomicRMWInst *const newAtomicRmw =
-        m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), atomicPointer, atomicRmwInst.getValOperand(),
-                                  atomicRmwInst.getAlign(), atomicRmwInst.getOrdering());
-    newAtomicRmw->setVolatile(atomicRmwInst.isVolatile());
-    newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID());
-    copyMetadata(newAtomicRmw, &atomicRmwInst);
+    auto createAtomicRmwFunc = [&](Value *pointer) {
+      AtomicRMWInst *const newAtomicRmw =
+          m_builder.CreateAtomicRMW(atomicRmwInst.getOperation(), pointer, atomicRmwInst.getValOperand(),
+                                    atomicRmwInst.getAlign(), atomicRmwInst.getOrdering());
+      newAtomicRmw->setVolatile(atomicRmwInst.isVolatile());
+      newAtomicRmw->setSyncScopeID(atomicRmwInst.getSyncScopeID());
+      copyMetadata(newAtomicRmw, &atomicRmwInst);
+      return newAtomicRmw;
+    };
+    Value *result = createGlobalPointerAccess(bufferDesc, baseIndex, storeType, atomicRmwInst, createAtomicRmwFunc);
 
     // Record the atomic instruction so we remember to delete it later.
     m_typeLowering.eraseInstruction(&atomicRmwInst);
 
-    atomicRmwInst.replaceAllUsesWith(newAtomicRmw);
+    atomicRmwInst.replaceAllUsesWith(result);
   } else {
     switch (atomicRmwInst.getOrdering()) {
     case AtomicOrdering::Release:
@@ -1292,36 +1277,28 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) {
 
   // If our buffer descriptor is divergent, need to handle that differently.
   if (getDescriptorInfo(bufferDesc).divergent.value()) {
-    Value *const baseAddr = getBaseAddressFromBufferDesc(bufferDesc);
-
-    // The 2nd element in the buffer descriptor is the byte bound, we do this to support robust buffer access.
-    Value *const bound = m_builder.CreateExtractElement(bufferDesc, 2);
-    Value *const inBound = m_builder.CreateICmpULT(baseIndex, bound);
-    Value *const newBaseIndex = m_builder.CreateSelect(inBound, baseIndex, m_builder.getInt32(0));
-
-    // Add on the index to the address.
-    Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newBaseIndex);
-
-    pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL));
-
-    if (isLoad) {
-      LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile());
-      newLoad->setOrdering(ordering);
-      newLoad->setSyncScopeID(syncScopeID);
-      copyMetadata(newLoad, loadInst);
-
-      if (isInvariant)
-        newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {}));
-
-      return newLoad;
-    }
-    StoreInst *const newStore =
-        m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile());
-    newStore->setOrdering(ordering);
-    newStore->setSyncScopeID(syncScopeID);
-    copyMetadata(newStore, storeInst);
-
-    return newStore;
+    auto createLoadStoreFunc = [&](Value *pointer) {
+      Value *result = nullptr;
+      if (isLoad) {
+        LoadInst *const newLoad = m_builder.CreateAlignedLoad(type, pointer, alignment, loadInst->isVolatile());
+        newLoad->setOrdering(ordering);
+        newLoad->setSyncScopeID(syncScopeID);
+        copyMetadata(newLoad, loadInst);
+
+        if (isInvariant)
+          newLoad->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {}));
+        result = newLoad;
+      } else {
+        StoreInst *const newStore =
+            m_builder.CreateAlignedStore(storeInst->getValueOperand(), pointer, alignment, storeInst->isVolatile());
+        newStore->setOrdering(ordering);
+        newStore->setSyncScopeID(syncScopeID);
+        copyMetadata(newStore, storeInst);
+        result = newStore;
+      }
+      return result;
+    };
+    return createGlobalPointerAccess(bufferDesc, baseIndex, type, inst, createLoadStoreFunc);
   }
 
   switch (ordering) {
@@ -1572,3 +1549,54 @@ Instruction *BufferOpLowering::makeLoop(Value *const loopStart, Value *const loo
 
   return loopCounter;
 }
+
+// =====================================================================================================================
+// Create global pointer access.
+//
+// @param bufferDesc: The buffer descriptor
+// @param offset: The offset on the global memory
+// @param type: The accessed data type
+// @param inst: The instruction to be executed on the buffer
+// @param callback: The callback function to perform the specific global access
+Value *BufferOpLowering::createGlobalPointerAccess(Value *const bufferDesc, Value *const offset, Type *const type,
+                                                   Instruction &inst, const function_ref<Value *(Value *)> callback) {
+  // The 2nd element (NUM_RECORDS) in the buffer descriptor is the byte bound.
+  Value *bound = m_builder.CreateExtractElement(bufferDesc, 2);
+  Value *inBound = m_builder.CreateICmpULT(offset, bound);
+
+  // If null descriptors or extended robust buffer access are allowed, create a branch and perform the normal global
+  // access only when the validity check passes.
+  Value *isValidAccess = m_builder.getTrue();
+  if (m_pipelineState.getOptions().allowNullDescriptor ||
+      m_pipelineState.getOptions().enableExtendedRobustBufferAccess) {
+    Value *isNonNullDesc = m_builder.getTrue();
+    if (m_pipelineState.getOptions().allowNullDescriptor) {
+      // Check dword2 against 0 for a null descriptor
+      isNonNullDesc = m_builder.CreateICmpNE(bound, m_builder.getInt32(0));
+    }
+    Value *isInBound = m_pipelineState.getOptions().enableExtendedRobustBufferAccess ? inBound : m_builder.getTrue();
+    isValidAccess = m_builder.CreateAnd(isNonNullDesc, isInBound);
+  }
+
+  BasicBlock *const origBlock = inst.getParent();
+  Instruction *const terminator = SplitBlockAndInsertIfThen(isValidAccess, &inst, false);
+
+  // Global pointer access
+  m_builder.SetInsertPoint(terminator);
+  Value *baseAddr = getBaseAddressFromBufferDesc(bufferDesc);
+  // NOTE: Overriding an out-of-bound offset with 0 may cause an unexpected result when extended robust buffer access
+  // is disabled.
+  Value *newOffset = m_builder.CreateSelect(inBound, offset, m_builder.getInt32(0));
+  // Add on the index to the address.
+  Value *pointer = m_builder.CreateGEP(m_builder.getInt8Ty(), baseAddr, newOffset);
+  pointer = m_builder.CreateBitCast(pointer, type->getPointerTo(ADDR_SPACE_GLOBAL));
+  Value *newValue = callback(pointer);
+
+  m_builder.SetInsertPoint(&inst);
+  assert(!type->isVoidTy());
+  auto phi = m_builder.CreatePHI(type, 2, "newValue");
+  phi->addIncoming(Constant::getNullValue(type), origBlock);
+  phi->addIncoming(newValue, terminator->getParent());
+
+  return phi;
+}
diff --git a/lgc/test/Transforms/PatchBufferOp/simple.lgc b/lgc/test/Transforms/PatchBufferOp/simple.lgc
index 35c84beccd..e172c4a40c 100644
--- a/lgc/test/Transforms/PatchBufferOp/simple.lgc
+++ b/lgc/test/Transforms/PatchBufferOp/simple.lgc
@@ -29,16 +29,21 @@ define amdgpu_gfx float @uniform_select(<4 x i32> inreg %desc0, <4 x i32> inreg
 define amdgpu_gfx float @divergent_select(<4 x i32> inreg %desc0, <4 x i32> inreg %desc1, i1 %sel) !lgc.shaderstage !0 {
 ; CHECK-LABEL: @divergent_select(
 ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0)
   %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1)
@@ -50,16 +55,21 @@ define amdgpu_gfx float
 define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %desc1, i1 inreg %sel) !lgc.shaderstage !0 {
 ; CHECK-LABEL: @divergent_select1(
 ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0)
   %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1)
@@ -71,16 +81,21 @@ define amdgpu_gfx float @divergent_select1(<4 x i32> %desc0, <4 x i32> inreg %de
 define amdgpu_gfx float @divergent_select2(<4 x i32> inreg %desc0, <4 x i32> %desc1, i1 inreg %sel) !lgc.shaderstage !0 {
 ; CHECK-LABEL: @divergent_select2(
 ; CHECK-NEXT: [[PTR_0:%.*]] = select i1 [[SEL:%.*]], <4 x i32> [[DESC0:%.*]], <4 x i32> [[DESC1:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   %ptr0 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc0)
   %ptr1 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc1)
@@ -131,16 +146,21 @@ define amdgpu_gfx float @divergent_input0_phi(<4 x i32> %desc0, <4 x i32> inreg
 ; CHECK-NEXT: br label [[TAIL]]
 ; CHECK: tail:
 ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   br i1 %sel, label %a, label %b
 
@@ -167,16 +187,21 @@ define amdgpu_gfx float @divergent_input1_phi(<4 x i32> inreg %desc0, <4 x i32>
 ; CHECK-NEXT: br label [[TAIL]]
 ; CHECK: tail:
 ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   br i1 %sel, label %a, label %b
 
@@ -203,16 +228,21 @@ define amdgpu_gfx float @divergent_sync_phi(<4 x i32> inreg %desc0, <4 x i32> in
 ; CHECK-NEXT: br label [[TAIL]]
 ; CHECK: tail:
 ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   br i1 %sel, label %a, label %b
 
diff --git a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc
index 9a1bdcdb55..9bdd634dfd 100644
--- a/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc
+++ b/lgc/test/Transforms/PatchBufferOp/uniform-phi.lgc
@@ -16,16 +16,21 @@ define amdgpu_gfx float @uniform_phi(<4 x i32> inreg %desc0, <4 x i32> inreg %de
 ; CHECK-NEXT: br label [[TAIL]]
 ; CHECK: tail:
 ; CHECK-NEXT: [[PTR_0:%.*]] = phi <4 x i32> [ [[DESC0:%.*]], [[A]] ], [ [[DESC1:%.*]], [[B]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 0, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP4]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(1) [[TMP8]], align 4
-; CHECK-NEXT: ret float [[TMP9]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PTR_0]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 0, [[TMP1]]
+; CHECK-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP11:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[PTR_0]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]],
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP7]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(1) [[TMP9]], align 4
+; CHECK-NEXT: br label [[TMP11]]
+; CHECK: 11:
+; CHECK-NEXT: [[NEWVALUE:%.*]] = phi float [ 0.000000e+00, [[TAIL]] ], [ [[TMP10]], [[TMP3]] ]
+; CHECK-NEXT: ret float [[NEWVALUE]]
 ;
   br i1 %sel, label %a, label %b
 
diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp
index e71c97e687..f808c4d294 100644
--- a/llpc/context/llpcPipelineContext.cpp
+++ b/llpc/context/llpcPipelineContext.cpp
@@ -328,6 +328,7 @@ Options PipelineContext::computePipelineOptions() const {
   }
 
   options.allowNullDescriptor = getPipelineOptions()->extendedRobustness.nullDescriptor;
+  options.enableExtendedRobustBufferAccess = getPipelineOptions()->extendedRobustness.robustBufferAccess;
   options.disableImageResourceCheck = getPipelineOptions()->disableImageResourceCheck;
   options.optimizeTessFactor = getPipelineOptions()->optimizeTessFactor;
   options.enableInterpModePatch = getPipelineOptions()->enableInterpModePatch;