-
Notifications
You must be signed in to change notification settings - Fork 247
Description
In OpenCL.jl, we lower atomic add on floats to the mangled builtin _Z10atomic_addPU8CLglobalVff. The LLVM SPIR-V backend recognizes this builtin and, if SPV_EXT_shader_atomic_float_add is enabled as an extension, even lowers it to an OpAtomicFAddEXT instruction and declares usage of the extension with OpExtension "SPV_EXT_shader_atomic_float_add".
The translator, on the other hand, does not appear to recognize that builtin and does not add OpExtension "SPV_EXT_shader_atomic_float_add". Instead, it expects the operation to be spelled using an atomicrmw instruction, as in
SPIRV-LLVM-Translator/test/extensions/EXT/SPV_EXT_shader_atomic_float_/atomicrmw_fadd_float.ll
Line 23 in 6238ea3
%0 = atomicrmw fadd ptr addrspace(1) @f, float 42.000000e+00 seq_cst
This leads to the following linker error when the resulting module is passed to, for example, pocl:
Failed to compile program
Build log:
Error(s) while linking:
Cannot find symbol _Z10atomic_addPU8CLglobalVff in kernel library
<unknown>:0:0: 7 instructions in function
<unknown>:0:0: 7 instructions in function
<unknown>:0:0: 7 instructions in function
<unknown>:0:0: 7 instructions in function
Device cpu-skylake-avx512-AMD Ryzen AI 7 PRO 360 w/ Radeon 880M failed to build the program
I believe this is due to pocl not being aware that the SPV_EXT_shader_atomic_float_add extension is required. Is this expected, and should we just change our lowering, or is it a bug that the translator did not add a declaration for the extension even though it was explicitly enabled with --spirv-ext=+SPV_EXT_shader_atomic_float_add?
Reproducer:
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"
target triple = "spir64-unknown-unknown"
declare float @_Z10atomic_addPU8CLglobalVff(ptr addrspace(1), float) local_unnamed_addr
define spir_kernel void @_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_(ptr byval({ { ptr addrspace(1), i64, i64 } }) %"counter::CLDeviceArray", float %"val::Float32") local_unnamed_addr {
conversion:
%0 = getelementptr inbounds { { ptr addrspace(1), i64, i64 } }, ptr %"counter::CLDeviceArray", i32 0, i32 0
%"counter::CLDeviceArray.unbox" = load ptr addrspace(1), ptr %0, align 8
%1 = call float @_Z10atomic_addPU8CLglobalVff(ptr addrspace(1) %"counter::CLDeviceArray.unbox", float %"val::Float32")
ret void
}
!llvm.module.flags = !{!0, !1}
!julia.kernel = !{!2}
!opencl.ocl.version = !{!3}
!opencl.spirv.version = !{!4}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{ptr @_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_}
!3 = !{i32 2, i32 0}
!4 = !{i32 1, i32 5}
This is translated to:
; SPIR-V
; Version: 1.0
; Generator: Khronos LLVM/SPIR-V Translator; 14
; Bound: 33
; Schema: 0
OpCapability Addresses
OpCapability Linkage
OpCapability Kernel
OpCapability Int64
OpCapability Int8
%1 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %28 "_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_"
OpExecutionMode %28 ContractionOff
OpSource OpenCL_C 200000
OpName %_Z10atomic_addPU8CLglobalVff "_Z10atomic_addPU8CLglobalVff"
OpName %_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_ "_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_"
OpName %counter__CLDeviceArray "counter::CLDeviceArray"
OpName %val__Float32 "val::Float32"
OpName %conversion "conversion"
OpName %counter__CLDeviceArray_unbox "counter::CLDeviceArray.unbox"
OpName %counter__CLDeviceArray_0 "counter::CLDeviceArray"
OpName %val__Float32_0 "val::Float32"
OpDecorate %_Z10atomic_addPU8CLglobalVff LinkageAttributes "_Z10atomic_addPU8CLglobalVff" Import
OpDecorate %_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_ LinkageAttributes "_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_" Export
OpDecorate %counter__CLDeviceArray FuncParamAttr ByVal
OpDecorate %counter__CLDeviceArray_0 FuncParamAttr ByVal
%uchar = OpTypeInt 8 0
%ulong = OpTypeInt 64 0
%uint = OpTypeInt 32 0
%uint_0 = OpConstant %uint 0
%float = OpTypeFloat 32
%_ptr_CrossWorkgroup_float = OpTypePointer CrossWorkgroup %float
%4 = OpTypeFunction %float %_ptr_CrossWorkgroup_float %float
%void = OpTypeVoid
%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
%_struct_10 = OpTypeStruct %_ptr_CrossWorkgroup_uchar %ulong %ulong
%_struct_9 = OpTypeStruct %_struct_10
%_ptr_Function__struct_9 = OpTypePointer Function %_struct_9
%15 = OpTypeFunction %void %_ptr_Function__struct_9 %float
%_ptr_Function__struct_10 = OpTypePointer Function %_struct_10
%_ptr_Function__ptr_CrossWorkgroup_float = OpTypePointer Function %_ptr_CrossWorkgroup_float
%_Z10atomic_addPU8CLglobalVff = OpFunction %float None %4
%6 = OpFunctionParameter %_ptr_CrossWorkgroup_float
%7 = OpFunctionParameter %float
OpFunctionEnd
%_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_ = OpFunction %void None %15
%counter__CLDeviceArray = OpFunctionParameter %_ptr_Function__struct_9
%val__Float32 = OpFunctionParameter %float
%conversion = OpLabel
%23 = OpInBoundsPtrAccessChain %_ptr_Function__struct_10 %counter__CLDeviceArray %uint_0 %uint_0
%25 = OpBitcast %_ptr_Function__ptr_CrossWorkgroup_float %23
%counter__CLDeviceArray_unbox = OpLoad %_ptr_CrossWorkgroup_float %25 Aligned 8
%27 = OpFunctionCall %float %_Z10atomic_addPU8CLglobalVff %counter__CLDeviceArray_unbox %val__Float32
OpReturn
OpFunctionEnd
%28 = OpFunction %void None %15
%counter__CLDeviceArray_0 = OpFunctionParameter %_ptr_Function__struct_9
%val__Float32_0 = OpFunctionParameter %float
%31 = OpLabel
%32 = OpFunctionCall %void %_Z16atomic_float_add13CLDeviceArrayI7Float32Li0ELi1EES0_ %counter__CLDeviceArray_0 %val__Float32_0
OpReturn
OpFunctionEnd