Split the two code paths of __ockl_atomic_add_noret_f32 (ockl/src/gaaf.cl)
into file-local static helpers.

  * global_atomic_fadd wraps the llvm.amdgcn.global.atomic.fadd.p1f32.f32
    intrinsic and is the only function marked with
    __attribute__((target("atomic-fadd-insts"))).
  * generic_atomic_fadd holds the relaxed, device-scope AL/AC retry loop that
    was previously written inline in the else branch.

The dispatch condition in __ockl_atomic_add_noret_f32 (ISA version 9008 and an
address that is neither local nor private) is unchanged.

NOTE(review): this patch was restored from a copy whose line breaks had been
collapsed. Indentation and the blank context lines are reconstructed from the
hunk counts (16 old / 28 new lines); confirm against the target file before
applying.

diff --git a/ockl/src/gaaf.cl b/ockl/src/gaaf.cl
index cb5ed822..2821aeaa 100644
--- a/ockl/src/gaaf.cl
+++ b/ockl/src/gaaf.cl
@@ -17,16 +17,28 @@
 
 extern void __llvm_amdgcn_global_atomic_fadd_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.p1f32.f32");
 
+__attribute__((target("atomic-fadd-insts"))) static void
+global_atomic_fadd(__global float *p, float v)
+{
+    __llvm_amdgcn_global_atomic_fadd_p1f32_f32(p, v);
+}
+
+static void
+generic_atomic_fadd(float *p, float v)
+{
+    atomic_uint *t = (atomic_uint *)p;
+    uint e = AL(t, memory_order_relaxed, memory_scope_device);
+    while (!AC(t, &e, AS_UINT(v + AS_FLOAT(e)), memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+        ;
+}
+
 void
 __ockl_atomic_add_noret_f32(float *p, float v)
 {
     if (__oclc_ISA_version == 9008 && !__ockl_is_local_addr(p) && !__ockl_is_private_addr(p)) {
-        __llvm_amdgcn_global_atomic_fadd_p1f32_f32((__global float *)p, v);
+        global_atomic_fadd((__global float *)p, v);
     } else {
-        atomic_uint *t = (atomic_uint *)p;
-        uint e = AL(t, memory_order_relaxed, memory_scope_device);
-        while (!AC(t, &e, AS_UINT(v + AS_FLOAT(e)), memory_order_relaxed, memory_order_relaxed, memory_scope_device))
-            ;
+        generic_atomic_fadd(p, v);
     }
 }
 