@@ -355,42 +355,24 @@ DECLARE_MMAD_EMU(mmad8x8_bf16, bf16_dot2, 8, 8, short8, int8, float8)
355355#endif
356356
357357// Atomics
358- #if __OPENCL_C_VERSION__ >= 200
// DECLARE_ATOMIC_OP(op, type): generates an overloadable wrapper function
// whose name is built by CONCAT3(atomic_, op, _global). The wrapper takes a
// volatile global pointer to the CONCAT2(atomic_, type) type plus an operand
// of `type`, and returns the result of the fetch-style builtin named by
// CONCAT3(atomic_fetch_, op, _explicit), called with memory_order_relaxed.
// No memory-scope argument is passed to the builtin.
// CONCAT2/CONCAT3 are defined outside this view -- presumably token-pasting
// helpers; confirm against their definitions.
// NOTE(review): as rendered here there is a space between the macro name and
// its parameter list; in real C that would make this an object-like macro.
// Presumably an artifact of how this text was extracted -- confirm against
// the actual file.
359358#define DECLARE_ATOMIC_OP (op , type ) \
360359 type __attribute__((overloadable)) CONCAT3(atomic_, op, _global)( \
361360 volatile global CONCAT2(atomic_, type) * source, type operand) { \
362361 return CONCAT3(atomic_fetch_, op, _explicit)( \
363362 source, operand, memory_order_relaxed); \
364363 }
365-
366- #if defined(cl_intel_global_float_atomics ) \
367- || (defined(cl_ext_float_atomics ) \
368- && defined(__opencl_c_ext_fp32_global_atomic_add ))
364+ #if __OPENCL_C_VERSION__ >= 200
365+ // Float atomic operations require:
366+ // 1. The cl_ext_float_atomics extension (for all float functions)
367+ // 2. The __opencl_c_ext_fp32_global_atomic_add feature (for float add/sub)
368+ // 3. The __opencl_c_ext_fp32_global_atomic_min_max feature (for float min/max)
369+ // All Intel GPUs should support these on up-to-date drivers, for all
370+ // architectures Gen9 and later.
// Expand DECLARE_ATOMIC_OP for float add and float sub, producing the
// relaxed-order wrappers around the corresponding atomic_fetch_*_explicit
// builtins.
369371DECLARE_ATOMIC_OP (add , float )
370372DECLARE_ATOMIC_OP (sub , float )
371- #else // float atomics
// Fallback float atomic add on a global-memory atomic_float, implemented as
// a compare-and-swap retry loop instead of a native fetch-add builtin.
// Every line of this definition carries the diff's "-" marker, i.e. it is on
// the removed side of this change.
//   source:  atomic float in global memory to add into
//   operand: value to add
//   returns: the value observed in *source by the exchange that succeeded
//            (the pre-add value)
372- inline float atomic_add_global (
373- volatile __global atomic_float * source , float operand ) {
// Initial snapshot of the target: relaxed load at device scope.
374- float old_val = atomic_load_explicit (
375- source , memory_order_relaxed , memory_scope_device );
376- bool success = false;
377- do {
378- float new_val = old_val + operand ;
// Strong compare-exchange: acq_rel ordering on success, relaxed on failure,
// device scope. The loop relies on a failed exchange writing the current
// contents of *source back into old_val (standard compare-exchange
// semantics), so the next iteration recomputes new_val from a fresh value.
379- success = atomic_compare_exchange_strong_explicit (source , & old_val ,
380- new_val , memory_order_acq_rel , memory_order_relaxed ,
381- memory_scope_device );
382- } while (!success );
383- return old_val ;
384- }
385- #endif
386373
387- #if defined(cl_intel_global_float_atomics ) \
388- || (defined(cl_ext_float_atomics ) \
389- && defined(__opencl_c_ext_fp32_global_atomic_min_max ))
// Expand DECLARE_ATOMIC_OP for float min and float max, producing the
// relaxed-order wrappers around the corresponding atomic_fetch_*_explicit
// builtins.
390374DECLARE_ATOMIC_OP (min , float )
391375DECLARE_ATOMIC_OP (max , float )
392376#endif
393377
394- #endif
395-
396- #endif
378+ #endif
0 commit comments