diff --git a/asanrtl/inc/asan_type_decls.h b/asanrtl/inc/asan_type_decls.h deleted file mode 100644 index 070aa4a..0000000 --- a/asanrtl/inc/asan_type_decls.h +++ /dev/null @@ -1,17 +0,0 @@ -/*===-------------------------------------------------------------------------- - * ROCm Device Libraries - * - * This file is distributed under the University of Illinois Open Source - * License. See LICENSE.TXT for details. - *===------------------------------------------------------------------------*/ - -// Provides short-hands for types used in runtime for -// parity with host sanitizer runtime - -#pragma once -typedef ulong uptr; -typedef unsigned char u8; -typedef signed char s8; -typedef unsigned short u16; -typedef short s16; -typedef unsigned long u64; diff --git a/asanrtl/inc/asan_util.h b/asanrtl/inc/asan_util.h new file mode 100644 index 0000000..941b843 --- /dev/null +++ b/asanrtl/inc/asan_util.h @@ -0,0 +1,60 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#pragma once +#include "ockl.h" + +typedef ulong uptr; +typedef unsigned char u8; +typedef signed char s8; +typedef unsigned short u16; +typedef short s16; +typedef unsigned long u64; + +#define ASAN_SHADOW 3 + +#define SHADOW_GRANULARITY (1ULL << ASAN_SHADOW) + +#define GET_CALLER_PC() ((uptr) __builtin_return_address(0)) + +#define WORKGROUP_ID(dim) __builtin_amdgcn_workgroup_id_##dim() + +#define OPT_NONE __attribute__((optnone)) + +#define NO_SANITIZE_ADDR __attribute__((no_sanitize("address"))) + +#define REPORT_IMPL(caller_pc, addr, is_write, size, no_abort) \ + { /* braces, not do/while(0): callers invoke this without a trailing ';' */ \ + /* read carries is_write; its upper 32 bits are all set when no_abort */ \ + uptr read = (is_write); \ + if (no_abort) \ + read |= 0xFFFFFFFF00000000UL; \ + __ockl_sanitizer_report((addr), (caller_pc), WORKGROUP_ID(x), WORKGROUP_ID(y), WORKGROUP_ID(z), __ockl_get_local_linear_id(), read, (size)); \ + } + +NO_SANITIZE_ADDR +static bool +is_aligned_by_granularity(uptr addr) +{ + return (addr & (SHADOW_GRANULARITY - 1)) == 0; +} + +// round up size to the nearest multiple of boundary. +NO_SANITIZE_ADDR +static uptr +round_upto(uptr size, uptr boundary) +{ + return (size + boundary - 1) & ~(boundary - 1); +} + +// round down size to the nearest multiple of boundary. 
+NO_SANITIZE_ADDR +static uptr +round_downto(uptr size, uptr boundary) +{ + return size & ~(boundary - 1); +} diff --git a/asanrtl/inc/globals.h b/asanrtl/inc/globals.h index c480fc7..8130b07 100644 --- a/asanrtl/inc/globals.h +++ b/asanrtl/inc/globals.h @@ -6,7 +6,7 @@ *===------------------------------------------------------------------------*/ #pragma once -#include "shadow_mapping.h" +#include "asan_util.h" // The strucutures semantics and layout must match the host instrumented // global variable as defined in diff --git a/asanrtl/inc/shadow_mapping.h b/asanrtl/inc/shadow_mapping.h index cf68e00..16eec88 100644 --- a/asanrtl/inc/shadow_mapping.h +++ b/asanrtl/inc/shadow_mapping.h @@ -6,11 +6,7 @@ *===------------------------------------------------------------------------*/ #pragma once -#include "asan_type_decls.h" - -#define ASAN_SHADOW 3 - -#define SHADOW_GRANULARITY (1ULL << ASAN_SHADOW) +#include "asan_util.h" //offset from llvm/compiler-rt/lib/asan/asan_mapping.h static const u64 kh_Linux64bit_ShadowOffset = @@ -18,10 +14,8 @@ static const u64 kh_Linux64bit_ShadowOffset = #define MEM_TO_SHADOW(mem_addr) (((mem_addr) >> ASAN_SHADOW) + kh_Linux64bit_ShadowOffset) -#define NO_SANITIZE_ADDR __attribute__((no_sanitize("address"))) - -//address are atleast SHADOW_GRANULARITY aligned -//true, when given byte is accessible false otherwise +// Addresses are atleast SHADOW_GRANULARITY aligned. +// True, when given byte is accessible false otherwise. 
NO_SANITIZE_ADDR static bool is_address_poisoned(uptr addr) @@ -35,21 +29,6 @@ is_address_poisoned(uptr addr) return false; } -//check all application bytes in [beg,beg+size) range are accessible NO_SANITIZE_ADDR -static bool -is_region_poisoned(uptr beg, uptr size) -{ - uptr end = beg + size - 1; - // Fast path - check first and last application bytes - if (is_address_poisoned(beg) || - is_address_poisoned(end)) - return true; - - // check all inner bytes - for (uptr addr = beg+1; addr < end; addr++){ - if (is_address_poisoned(addr)) - return true; - } - return false; -} +uptr +__asan_region_is_poisoned(uptr beg, uptr size); diff --git a/asanrtl/src/globals.cl b/asanrtl/src/globals.cl index aa37427..21acfe7 100644 --- a/asanrtl/src/globals.cl +++ b/asanrtl/src/globals.cl @@ -5,24 +5,9 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ +#include "asan_util.h" #include "globals.h" - -static bool -is_aligned_by_granularity(uptr addr) { - return (addr & (SHADOW_GRANULARITY - 1)) == 0; -} - -// round up size to the nearest multiple of boundary. -static uptr -round_upto(uptr size, uptr boundary) { - return (size + boundary - 1) & ~(boundary - 1); -} - -// round down size to the nearest multiple of boundary. -static uptr -round_downto(uptr size, uptr boundary) { - return size & ~(boundary - 1); -} +#include "shadow_mapping.h" // fill shadow bytes of range [aligned_beg, aligned_beg+aligned_size) // with value. diff --git a/asanrtl/src/memintrinsics.cl b/asanrtl/src/memintrinsics.cl new file mode 100644 index 0000000..e6a0fd9 --- /dev/null +++ b/asanrtl/src/memintrinsics.cl @@ -0,0 +1,61 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#include "asan_util.h" +#include "shadow_mapping.h" + +OPT_NONE +NO_SANITIZE_ADDR +static void +check_memory_range_accessible(void* dest, const void* src, + uptr size, uptr pc) { + if (size == 0) + return; + uptr invalid_addr = 0; + uptr src_addr = (uptr)src; + invalid_addr = __asan_region_is_poisoned(src_addr, size); + if (invalid_addr) { + REPORT_IMPL(pc, invalid_addr, false, size, false) + } + uptr dest_addr = (uptr)dest; + invalid_addr = __asan_region_is_poisoned(dest_addr, size); + if (invalid_addr) { + REPORT_IMPL(pc, invalid_addr, true, size, false) + } +} + +OPT_NONE +NO_SANITIZE_ADDR +void* +__asan_memcpy(void* to, const void* from, uptr size) { + uptr pc = GET_CALLER_PC(); + check_memory_range_accessible(to, from, size, pc); + return __builtin_memcpy(to, from, size); +} + +OPT_NONE +NO_SANITIZE_ADDR +void* +__asan_memmove(void* to, const void* from, uptr size) { + uptr pc = GET_CALLER_PC(); + check_memory_range_accessible(to, from, size, pc); + return __builtin_memmove(to, from, size); +} + +OPT_NONE +NO_SANITIZE_ADDR +void* +__asan_memset(void* s, int c, uptr n) { + uptr pc = GET_CALLER_PC(); + uptr invalid_addr = 0; + if (n != 0) // a zero-length fill touches no memory; skip the check like memcpy/memmove do + invalid_addr = __asan_region_is_poisoned((uptr)s, n); + if (invalid_addr) { + REPORT_IMPL(pc, invalid_addr, true, n, false) + } + return __builtin_memset(s, c, n); +} diff --git a/asanrtl/src/preserve.cl b/asanrtl/src/preserve.cl index 02bf72a..c8a9e79 100644 --- a/asanrtl/src/preserve.cl +++ b/asanrtl/src/preserve.cl @@ -54,6 +54,7 @@ extern void __asan_store_n (uptr addr, uptr size); extern void __asan_store_n_noabort (uptr addr, uptr size); extern void __asan_load_n (uptr addr, uptr size); extern void __asan_load_n_noabort (uptr addr, uptr size); +extern uptr __asan_region_is_poisoned(uptr beg, uptr size); extern void* __asan_memmove(void* to, void* from, uptr size); extern void* __asan_memcpy(void* to, void* from, uptr size); 
extern void* __asan_memset(void* s, int c, uptr n); @@ -122,6 +123,7 @@ __amdgpu_device_library_preserve_asan_functions(void) __asan_store_n_noabort(0, 0); __asan_load_n(0, 0); __asan_load_n_noabort(0, 0); + __asan_region_is_poisoned(0, 0); (void)__asan_memmove((void*)0, (void*)0, 0); (void)__asan_memcpy((void*)0, (void*)0, 0); (void)__asan_memset((void*)0, 0, 0); diff --git a/asanrtl/src/report.cl b/asanrtl/src/report.cl index 976ecb1..a01d23d 100644 --- a/asanrtl/src/report.cl +++ b/asanrtl/src/report.cl @@ -5,27 +5,9 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ -#include "ockl.h" -#include "asan_type_decls.h" +#include "asan_util.h" #include "shadow_mapping.h" -#define GET_CALLER_PC() \ - (uptr) __builtin_return_address(0) - -#define WORKGROUP_ID(dim) \ - __builtin_amdgcn_workgroup_id_ ##dim() - -#define OPT_NONE __attribute__((optnone)) - -#define REPORT_IMPL(caller_pc, addr, is_write, size, no_abort) \ - uptr read = is_write; \ - if (no_abort) \ - read |= 0xFFFFFFFF00000000; \ - \ - __ockl_sanitizer_report(addr, caller_pc, WORKGROUP_ID(x), WORKGROUP_ID(y), \ - WORKGROUP_ID(z), __ockl_get_local_linear_id(), read, size); \ - - #define ASAN_REPORT_ERROR(type, size, is_write) \ OPT_NONE \ void __asan_report_ ## type ## size(uptr addr) { \ @@ -108,14 +90,14 @@ ASAN_ERROR(store, 16,1) OPT_NONE NO_SANITIZE_ADDR \ void __asan_ ## type ## _n(uptr addr, uptr size) { \ uptr caller_pc = GET_CALLER_PC(); \ - if (is_region_poisoned(addr, size)) { \ + if (__asan_region_is_poisoned(addr, size)) { \ REPORT_IMPL(caller_pc, addr, is_write, size, false) \ } \ } \ OPT_NONE NO_SANITIZE_ADDR \ void __asan_ ## type ## _n_noabort(uptr addr, uptr size) { \ uptr caller_pc = GET_CALLER_PC(); \ - if (is_region_poisoned(addr, size)) { \ + if (__asan_region_is_poisoned(addr, size)) { \ REPORT_IMPL(caller_pc, addr, is_write, size, true) \ } \ } \ diff --git a/asanrtl/src/shadow_mapping.cl 
b/asanrtl/src/shadow_mapping.cl new file mode 100644 index 0000000..e18c65a --- /dev/null +++ b/asanrtl/src/shadow_mapping.cl @@ -0,0 +1,55 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "shadow_mapping.h" + +// Scan the shadow bytes covering the application range [beg, end] (end inclusive). +// Returns 0 when every shadow byte is zero; otherwise returns the address, +// never below beg, from which the caller should check individual bytes. +NO_SANITIZE_ADDR +static uptr +range_check(uptr beg, uptr end) { + uptr aligned_beg = round_downto(beg, SHADOW_GRANULARITY); + uptr aligned_end = round_downto(end, SHADOW_GRANULARITY); + uptr shadow_beg = MEM_TO_SHADOW(aligned_beg); + uptr shadow_end = MEM_TO_SHADOW(aligned_end); + uptr nbytes = (shadow_end - shadow_beg)+1; + uptr shadow_byte_count = 0; + while (shadow_beg <= shadow_end) { + s8 shadow_value = *(__global s8 *)shadow_beg; + if (shadow_value) + break; + shadow_byte_count++; + shadow_beg++; + } + if (shadow_byte_count == nbytes) + return 0; + uptr start_addr = round_downto(beg + (shadow_byte_count*SHADOW_GRANULARITY), SHADOW_GRANULARITY); + // Rounding down can land before beg; keep the result inside the region. + return start_addr < beg ? beg : start_addr; +} + +// Returns the address of the first poisoned byte in [beg, beg+size), or 0 +// when no byte in the region is poisoned. An empty region is never poisoned. +NO_SANITIZE_ADDR +uptr +__asan_region_is_poisoned(uptr beg, uptr size) +{ + // For size == 0 the inclusive end below would wrap around to beg - 1. + if (size == 0) + return 0; + uptr end = beg + size - 1; + uptr start_addr = range_check(beg, end); + if (start_addr != 0) { + // Walk the rest of the range to find the first poisoned address. 
+ for (uptr addr = start_addr; addr <= end; ++addr) { + if (is_address_poisoned(addr)) + return addr; + } + } + return 0; +} diff --git a/asanrtl/src/stubs.cl b/asanrtl/src/stubs.cl index 2e6180d..c68fe1a 100644 --- a/asanrtl/src/stubs.cl +++ b/asanrtl/src/stubs.cl @@ -7,13 +7,6 @@ typedef ulong uptr; - -void* __asan_memmove(void* to, void* from, uptr size) { return to; } - -void* __asan_memcpy(void* to, void* from, uptr size) { return to; } - -void* __asan_memset(void* s, int c, uptr n) { return s; } - void __asan_handle_no_return(void) {} void __sanitizer_ptr_cmp(uptr a, uptr b) {} diff --git a/ockl/src/services.cl b/ockl/src/services.cl index db936ea..176aecf 100644 --- a/ockl/src/services.cl +++ b/ockl/src/services.cl @@ -401,7 +401,7 @@ __ockl_sanitizer_report(ulong addr, ulong pc, ulong wgidx, ulong wgidy, /*===--- DEVMEM ----------------------------------------------------------*/ -ulong +WEAK_ATTR ulong __ockl_devmem_request(ulong addr, ulong size) { long2 result = __ockl_hostcall_preview(SERVICE_DEVMEM, addr, size, 0, 0, 0, 0, 0, 0); diff --git a/ocml/src/tanred2D.cl b/ocml/src/tanred2D.cl index 18dd4bf..39c6864 100644 --- a/ocml/src/tanred2D.cl +++ b/ocml/src/tanred2D.cl @@ -7,9 +7,31 @@ #include "mathD.h" +#define DOUBLE_SPECIALIZATION +#include "ep.h" + +#define NOCFLOW + CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel) { +#if defined NOCFLOW + double s = sqr(con(x,xx)).hi; + double p = s * MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, + MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, + MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, + MATH_MAD(s, + 0x1.5e089c751c08cp-16, -0x1.78809a9a29f71p-15), + 0x1.7746f90a8aaep-14), -0x1.bb44da6fbf144p-16), + 0x1.1e634a7943acfp-13), 0x1.d250fdeb68febp-13), + 0x1.37fd9b58c4d95p-11), 0x1.7d5af15120e2cp-10), + 0x1.d6d93e09491dfp-9), 0x1.226e12033784dp-7), + 0x1.664f49ac36ae2p-6), 0x1.ba1ba1b451c21p-5), + 0x1.11111111185b7p-3), 0x1.55555555554eep-2); + double2 t = fadd(con(x,xx), mul(x, p)); + 
double2 tr = frcp(t); + return sel ? -tr.hi : t.hi; +#else const double piby4_lead = 0x1.921fb54442d18p-1; const double piby4_tail = 0x1.1a62633145c06p-55; @@ -55,16 +77,16 @@ MATH_PRIVATE(tanred2)(double x, double xx, int sel) } else { if (sel) { // Compute -1.0/(t1 + t2) accurately - double z1 = AS_DOUBLE(AS_LONG(tp) & 0xffffffff00000000L); - double z2 = t2 - (z1 - t1); - double trec = -MATH_FAST_RCP(tp); - double trec_top = AS_DOUBLE(AS_LONG(trec) & 0xffffffff00000000L); - ret = MATH_MAD(MATH_MAD(trec_top, z2, MATH_MAD(trec_top, z1, 1.0)), trec, trec_top); + double tq = t2 - (tp - t1); + double tr = -MATH_FAST_RCP(tp); + double e = MATH_MAD(tr, tq, MATH_MAD(tr, tp, 1.0)); + ret = MATH_MAD(e, tr, tr); } else { ret = tp; } } return ret; +#endif } diff --git a/ocml/src/tanredF.cl b/ocml/src/tanredF.cl index b1a196c..70fcbd2 100644 --- a/ocml/src/tanredF.cl +++ b/ocml/src/tanredF.cl @@ -14,17 +14,25 @@ MATH_PRIVATE(tanred)(float x, int i) float s = x * x; #if defined MORE_ACCURACY - float t = MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, - MATH_MAD(s, MATH_MAD(s, - 0x1.65e368p-8f, -0x1.334754p-9f), 0x1.a93cacp-7f), 0x1.4d80eap-6f), - 0x1.bc8056p-5f), 0x1.1103bep-3f), 0x1.555578p-2f); - t = MATH_MAD(x*s, t, x); - float tr = -MATH_RCP(t); + float p = s * MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, MATH_MAD(s, + MATH_MAD(s, + 0x1.33d5e6p-7f, 0x1.9697f8p-9f), 0x1.907be2p-6f), 0x1.b581ap-5f), + 0x1.112e2p-3f), 0x1.5554dcp-2f); #else float a = MATH_MAD(s, -0x1.19dba6p-6f, 0x1.8a8b0ep-2f); float b = MATH_MAD(s, MATH_MAD(s, 0x1.2e2900p-6f, -0x1.07266ep-1f), 0x1.27e84ap+0f); - float t = MATH_MAD(x*s, MATH_FAST_DIV(a, b), x); + float p = s * MATH_FAST_DIV(a,b); +#endif + +#if defined LESS_ACCURACY + float t = MATH_MAD(p, x, x); + float tr = -MATH_FAST_RCP(t); +#else + float t = BUILTIN_FMA_F32(p, x, x); + float tt = BUILTIN_FMA_F32(p, x, -(t - x)); float tr = -MATH_FAST_RCP(t); + float e = BUILTIN_FMA_F32(tt, tr, BUILTIN_FMA_F32(t, tr, 1.0f)); + tr = BUILTIN_FMA_F32(e, tr, tr); 
#endif return i ? tr : t; diff --git a/ocml/src/tgammaF.cl b/ocml/src/tgammaF.cl index 0b4654c..a4c2e8f 100644 --- a/ocml/src/tgammaF.cl +++ b/ocml/src/tgammaF.cl @@ -16,7 +16,7 @@ MATH_MANGLE(tgamma)(float x) float ax = BUILTIN_ABS_F32(x); float ret; - if (ax > 0.0125f) { + if (ax > 0x1.0p-6f) { // For x < 3, push to larger value using gamma(x) = gamma(x+1) / x float d = 1.0f; if (x < 1.0f) { @@ -48,16 +48,11 @@ MATH_MANGLE(tgamma)(float x) ret = BUILTIN_FRACTION_F32(x) == 0.0f ? AS_FLOAT(QNANBITPATT_SP32) : ret; } } else { - float p = MATH_MAD(ax, - MATH_MAD(ax, - MATH_MAD(ax, 0.95758557809281868459f, -0.90729132749086121523f), - 0.98905552641429454945f), - -0.57721566471808262829f); - if (BUILTIN_CLASS_F32(x, CLASS_PZER|CLASS_PSUB|CLASS_PNOR)) - ret = MATH_RCP(ax) + p; - else - ret = MATH_DIV(pi, MATH_MAD(ax, p, 1.0f) * MATH_MANGLE(sinpi)(x)); + ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, + 0x1.f6a510p-1f, -0x1.d0a118p-1f), 0x1.fa658cp-1f), -0x1.2788d0p-1f) + + 4.0f*MATH_FAST_RCP(4.0f*x); } + return ret; } diff --git a/ocml/src/trigredlargeF.cl b/ocml/src/trigredlargeF.cl index 94ea8ae..ea32daf 100644 --- a/ocml/src/trigredlargeF.cl +++ b/ocml/src/trigredlargeF.cl @@ -8,14 +8,6 @@ #include "mathF.h" #include "trigredF.h" -#define FULL_MUL(A, B, HI, LO) \ - LO = A * B; \ - HI = BUILTIN_MULHI_U32(A, B) - -#define FULL_MAD(A, B, C, HI, LO) \ - LO = BUILTIN_MAD_U32(A, B, C); \ - HI = BUILTIN_MULHI_U32(A, B); \ - HI += LO < C CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(float x) @@ -32,15 +24,16 @@ MATH_PRIVATE(trigredlarge)(float x) const uint b1 = 0x3C439041U; const uint b0 = 0xFE5163ABU; - uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; + uint p0, p1, p2, p3, p4, p5, p6, p7; + ulong a; - FULL_MUL(xm, b0, c0, p0); - FULL_MAD(xm, b1, c0, c1, p1); - FULL_MAD(xm, b2, c1, c0, p2); - FULL_MAD(xm, b3, c0, c1, p3); - FULL_MAD(xm, b4, c1, c0, p4); - FULL_MAD(xm, b5, c0, c1, p5); - FULL_MAD(xm, b6, c1, p7, p6); + a = (ulong)xm * (ulong)b0; p0 = a; a >>= 32; + a = (ulong)xm 
* (ulong)b1 + a; p1 = a; a >>= 32; + a = (ulong)xm * (ulong)b2 + a; p2 = a; a >>= 32; + a = (ulong)xm * (ulong)b3 + a; p3 = a; a >>= 32; + a = (ulong)xm * (ulong)b4 + a; p4 = a; a >>= 32; + a = (ulong)xm * (ulong)b5 + a; p5 = a; a >>= 32; + a = (ulong)xm * (ulong)b6 + a; p6 = a; p7 = a >> 32; uint fbits = 224 + 23 - xe; @@ -49,24 +42,14 @@ MATH_PRIVATE(trigredlarge)(float x) uint shift = 256U - 2 - fbits; // Shift by up to 134/32 = 4 words - int c = shift > 31; - p7 = c ? p6 : p7; - p6 = c ? p5 : p6; - p5 = c ? p4 : p5; - p4 = c ? p3 : p4; - p3 = c ? p2 : p3; - p2 = c ? p1 : p2; - p1 = c ? p0 : p1; - shift -= (-c) & 32; - - c = shift > 31; - p7 = c ? p6 : p7; - p6 = c ? p5 : p6; - p5 = c ? p4 : p5; - p4 = c ? p3 : p4; - p3 = c ? p2 : p3; - p2 = c ? p1 : p2; - shift -= (-c) & 32; + int c = shift > 63; + p7 = c ? p5 : p7; + p6 = c ? p4 : p6; + p5 = c ? p3 : p5; + p4 = c ? p2 : p4; + p3 = c ? p1 : p3; + p2 = c ? p0 : p2; + shift -= (-c) & 64; c = shift > 31; p7 = c ? p6 : p7; diff --git a/opencl/src/misc/atom.cl b/opencl/src/misc/atom.cl index 82ef537..950e4f3 100644 --- a/opencl/src/misc/atom.cl +++ b/opencl/src/misc/atom.cl @@ -35,27 +35,27 @@ #define RC_float(X) as_float(X) #define RC_double(X) as_double(X) -#define PC_int (VOLATILE atomic_int *) -#define PC_uint (VOLATILE atomic_uint *) -#define PC_long (VOLATILE atomic_long *) -#define PC_ulong (VOLATILE atomic_ulong *) -#define PC_intptr_t (VOLATILE atomic_intptr_t *) -#define PC_uintptr_t (VOLATILE atomic_uintptr_t *) -#define PC_size_t (VOLATILE atomic_size_t *) -#define PC_ptrdiff_t (VOLATILE atomic_ptrdiff_t *) -#define PC_float (VOLATILE atomic_int *) -#define PC_double (VOLATILE atomic_long *) - -#define EC_int -#define EC_uint -#define EC_long -#define EC_ulong -#define EC_intptr_t -#define EC_uintptr_t -#define EC_size_t -#define EC_ptrdiff_t -#define EC_float (int *) -#define EC_double (long *) +#define AT_int atomic_int +#define AT_uint atomic_uint +#define AT_long atomic_long +#define AT_ulong 
atomic_ulong +#define AT_intptr_t atomic_intptr_t +#define AT_uintptr_t atomic_uintptr_t +#define AT_size_t atomic_size_t +#define AT_ptrdiff_t atomic_ptrdiff_t +#define AT_float atomic_int +#define AT_double atomic_long + +#define ET_int int +#define ET_uint uint +#define ET_long long +#define ET_ulong ulong +#define ET_intptr_t intptr_t +#define ET_uintptr_t uintptr_t +#define ET_size_t size_t +#define ET_ptrdiff_t ptrdiff_t +#define ET_float int +#define ET_double long #define OCL12_MEMORY_ORDER memory_order_relaxed #define OCL12_MEMORY_SCOPE memory_scope_device @@ -68,14 +68,14 @@ ATTR T \ atom_##O(volatile A T *p, T v) \ { \ - return __opencl_atomic_fetch_##O((VOLATILE atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return __opencl_atomic_fetch_##O((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ } #define GEN2(T,A,O) \ ATTR T \ atomic_##O(volatile A T *p, T v) \ { \ - return __opencl_atomic_fetch_##O((VOLATILE atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return __opencl_atomic_fetch_##O((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ } #define OPSA(F,T,A) \ @@ -116,14 +116,14 @@ ALL() ATTR T \ atom_##O(volatile A T *p) \ { \ - return F_##O((VOLATILE atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return F_##O((VOLATILE A atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ } #define GEN2(T,A,O) \ ATTR T \ atomic_##O(volatile A T *p) \ { \ - return F_##O((VOLATILE atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return F_##O((VOLATILE A atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ } ALL() @@ -138,14 +138,14 @@ ALL() ATTR T \ atom_xchg(volatile A T *p, T v) \ { \ - return __opencl_atomic_exchange((VOLATILE atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return __opencl_atomic_exchange((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ } #define GEN2(T,A) \ ATTR T \ 
atomic_xchg(volatile A T *p, T v) \ { \ - return __opencl_atomic_exchange((VOLATILE atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return __opencl_atomic_exchange((VOLATILE A atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ } #define OPS(F,T) \ @@ -159,7 +159,7 @@ ALL() ATTR float \ atomic_xchg(volatile A float *p, float v) \ { \ - return as_float(__opencl_atomic_exchange((VOLATILE atomic_int *)p, as_int(v), OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE)); \ + return as_float(__opencl_atomic_exchange((VOLATILE A atomic_int *)p, as_int(v), OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE)); \ } G(__local) @@ -175,7 +175,7 @@ G() ATTR T \ atom_cmpxchg(volatile A T *p, T e, T d) \ { \ - __opencl_atomic_compare_exchange_strong((VOLATILE atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + __opencl_atomic_compare_exchange_strong((VOLATILE A atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ return e; \ } @@ -183,7 +183,7 @@ atom_cmpxchg(volatile A T *p, T e, T d) \ ATTR T \ atomic_cmpxchg(volatile A T *p, T e, T d) \ { \ - __opencl_atomic_compare_exchange_strong((VOLATILE atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + __opencl_atomic_compare_exchange_strong((VOLATILE A atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ return e; \ } @@ -193,211 +193,254 @@ ALL() #undef ALL // 2.0 functions +#undef EXPLICIT_ASPACES -#define GENI(T) \ +#define GENIA(A,T) \ ATTR void \ -atomic_init(volatile atomic_##T *p, T v) \ +atomic_init(volatile A atomic_##T *p, T v) \ { \ - __opencl_atomic_init((VOLATILE atomic_##T *)p, v); \ + __opencl_atomic_init((VOLATILE A atomic_##T *)p, v); \ } -#define GENS(T) \ +#define GENSA(A,T) \ ATTR void \ -atomic_store(volatile atomic_##T *p, T v) \ +atomic_store(volatile A atomic_##T *p, T v) \ { \ - __opencl_atomic_store((VOLATILE atomic_##T *)p, v, memory_order_seq_cst, memory_scope_device); \ 
+ __opencl_atomic_store((VOLATILE A atomic_##T *)p, v, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR void \ -atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o) \ +atomic_store_explicit(volatile A atomic_##T *p, T v, memory_order o) \ { \ - __opencl_atomic_store((VOLATILE atomic_##T *)p, v, o, memory_scope_device); \ + __opencl_atomic_store((VOLATILE A atomic_##T *)p, v, o, memory_scope_device); \ } \ \ ATTR void \ -atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \ +atomic_store_explicit(volatile A atomic_##T *p, T v, memory_order o, memory_scope s) \ { \ - __opencl_atomic_store((VOLATILE atomic_##T *)p, v, o, s); \ + __opencl_atomic_store((VOLATILE A atomic_##T *)p, v, o, s); \ } -#define GENL(T) \ +#define GENLA(A,T) \ ATTR T \ -atomic_load(volatile atomic_##T *p) \ +atomic_load(volatile A atomic_##T *p) \ { \ - return __opencl_atomic_load((VOLATILE atomic_##T *)p, memory_order_seq_cst, memory_scope_device); \ + return __opencl_atomic_load((VOLATILE A atomic_##T *)p, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR T \ -atomic_load_explicit(volatile atomic_##T *p, memory_order o) \ +atomic_load_explicit(volatile A atomic_##T *p, memory_order o) \ { \ - return __opencl_atomic_load((VOLATILE atomic_##T *)p, o, memory_scope_device); \ + return __opencl_atomic_load((VOLATILE A atomic_##T *)p, o, memory_scope_device); \ } \ \ ATTR T \ -atomic_load_explicit(volatile atomic_##T *p, memory_order o, memory_scope s) \ +atomic_load_explicit(volatile A atomic_##T *p, memory_order o, memory_scope s) \ { \ - return __opencl_atomic_load((VOLATILE atomic_##T *)p, o, s); \ + return __opencl_atomic_load((VOLATILE A atomic_##T *)p, o, s); \ } -#define GENX(T) \ +#define GENXA(A,T) \ ATTR T \ -atomic_exchange(volatile atomic_##T *p, T v) \ +atomic_exchange(volatile A atomic_##T *p, T v) \ { \ - return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \ + return 
RC_##T(__opencl_atomic_exchange((VOLATILE A AT_##T *)p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \ } \ \ ATTR T \ -atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o) \ +atomic_exchange_explicit(volatile A atomic_##T *p, T v, memory_order o) \ { \ - return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, memory_scope_device)); \ + return RC_##T(__opencl_atomic_exchange((VOLATILE A AT_##T *)p, AC_##T(v), o, memory_scope_device)); \ } \ \ ATTR T \ -atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \ +atomic_exchange_explicit(volatile A atomic_##T *p, T v, memory_order o, memory_scope s) \ { \ - return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, s)); \ + return RC_##T(__opencl_atomic_exchange((VOLATILE A AT_##T *)p, AC_##T(v), o, s)); \ } -#define GENCX(T,K) \ +#define GENCXAA(AP,AE,T,K) \ ATTR bool \ -atomic_compare_exchange_##K(volatile atomic_##T *p, T *e, T d) \ +atomic_compare_exchange_##K(volatile AP atomic_##T *p, AE T *e, T d) \ { \ - return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \ + return __opencl_atomic_compare_exchange_##K((VOLATILE AP AT_##T *) p, (AE ET_##T *) e, AC_##T(d), memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \ } \ \ ATTR bool \ -atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of) \ +atomic_compare_exchange_##K##_explicit(volatile AP atomic_##T *p, AE T *e, T d, memory_order os, memory_order of) \ { \ - return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, memory_scope_device); \ + return __opencl_atomic_compare_exchange_##K((VOLATILE AP AT_##T *)p, (AE ET_##T *)e, AC_##T(d), os, of, memory_scope_device); \ } \ \ ATTR bool \ -atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of, memory_scope s) \ 
+atomic_compare_exchange_##K##_explicit(volatile AP atomic_##T *p, AE T *e, T d, memory_order os, memory_order of, memory_scope s) \ { \ - return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, s); \ + return __opencl_atomic_compare_exchange_##K((VOLATILE AP AT_##T *) p, (AE ET_##T *)e, AC_##T(d), os, of, s); \ } -#define GENFO(T,O) \ +#if defined EXPLICIT_ASPACES +#define GENCXA(A,T,K) \ + GENCXAA(A,__global,T,K) \ + GENCXAA(A,__local,T,K) \ + GENCXAA(A,__private,T,K) \ + GENCXAA(A,,T,K) +#else +#define GENCXA(A,T,K) GENCXAA(A,,T,K) +#endif + +#define GENFOA(A,T,O) \ ATTR T \ -atomic_fetch_##O(volatile atomic_##T *p, T v) \ +atomic_fetch_##O(volatile A atomic_##T *p, T v) \ { \ - return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \ + return RC_##T(__opencl_atomic_fetch_##O((VOLATILE A AT_##T *)p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \ } \ \ ATTR T \ -atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o) \ +atomic_fetch_##O##_explicit(volatile A atomic_##T *p, T v, memory_order o) \ { \ - return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, memory_scope_device)); \ + return RC_##T(__opencl_atomic_fetch_##O((VOLATILE A AT_##T *)p, AC_##T(v), o, memory_scope_device)); \ } \ \ ATTR T \ -atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \ +atomic_fetch_##O##_explicit(volatile A atomic_##T *p, T v, memory_order o, memory_scope s) \ { \ - return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, s)); \ + return RC_##T(__opencl_atomic_fetch_##O((VOLATILE A AT_##T *) p, AC_##T(v), o, s)); \ } -#define CX(T) \ - GENCX(T,strong) \ - GENCX(T,weak) - -#define FO(T) \ - GENFO(T,add) \ - GENFO(T,sub) \ - GENFO(T,or) \ - GENFO(T,xor) \ - GENFO(T,and) \ - GENFO(T,min) \ - GENFO(T,max) \ - +#define CXA(A,T) \ + GENCXA(A,T,strong) \ + GENCXA(A,T,weak) + +#define FOA(A,T) \ + GENFOA(A,T,add) \ + 
GENFOA(A,T,sub) \ + GENFOA(A,T,or) \ + GENFOA(A,T,xor) \ + GENFOA(A,T,and) \ + GENFOA(A,T,min) \ + GENFOA(A,T,max) \ + +#define ALLIA(A,F) \ + F(A,int) \ + F(A,uint) \ + F(A,long) \ + F(A,ulong) + +#define ALLA(A,F) \ + ALLIA(A,F) \ + F(A,float) \ + F(A,double) + +#if defined EXPLICIT_ASPACES #define ALLI(F) \ - F(int) \ - F(uint) \ - F(long) \ - F(ulong) - + ALLIA(__global, F) \ + ALLIA(__local, F) \ + ALLIA(, F) +#else +#define ALLI(F) ALLIA(, F) +#endif + +#if defined EXPLICIT_ASPACES #define ALL(F) \ - ALLI(F) \ - F(float) \ - F(double) - -ALL(GENI) -ALL(GENL) -ALL(GENS) -ALL(GENX) -ALL(CX) -ALLI(FO) + ALLA(__global,F) \ + ALLA(__local, F) \ + ALLA(, F) +#else +#define ALL(F) ALLA(, F) +#endif + +ALL(GENIA) +ALL(GENLA) +ALL(GENSA) +ALL(GENXA) +ALL(CXA) +ALLI(FOA) // These are needed for uintptr_t -ATTR ulong -atomic_fetch_add(volatile atomic_ulong *p, long v) -{ - return __opencl_atomic_fetch_add((VOLATILE atomic_ulong *)p, (ulong)v, memory_order_seq_cst, memory_scope_device); -} - -ATTR ulong -atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o) -{ - return __opencl_atomic_fetch_add((VOLATILE atomic_ulong *)p, (ulong)v, o, memory_scope_device); -} - -ATTR ulong -atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s) -{ - return __opencl_atomic_fetch_add((VOLATILE atomic_ulong *)p, (ulong)v, o, s); -} - -ATTR ulong -atomic_fetch_sub(volatile atomic_ulong *p, long v) -{ - return __opencl_atomic_fetch_sub((VOLATILE atomic_ulong *)p, (ulong)v, memory_order_seq_cst, memory_scope_device); -} - -ATTR ulong -atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o) -{ - return __opencl_atomic_fetch_sub((VOLATILE atomic_ulong *)p, (ulong)v, o, memory_scope_device); +#define UIP(A) \ +ATTR ulong \ +atomic_fetch_add(volatile A atomic_ulong *p, long v) \ +{ \ + return __opencl_atomic_fetch_add((VOLATILE A atomic_ulong *)p, (ulong)v, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR 
ulong \ +atomic_fetch_add_explicit(volatile A atomic_ulong *p, long v, memory_order o) \ +{ \ + return __opencl_atomic_fetch_add((VOLATILE A atomic_ulong *)p, (ulong)v, o, memory_scope_device); \ +} \ + \ +ATTR ulong \ +atomic_fetch_add_explicit(volatile A atomic_ulong *p, long v, memory_order o, memory_scope s) \ +{ \ + return __opencl_atomic_fetch_add((VOLATILE A atomic_ulong *)p, (ulong)v, o, s); \ +} \ + \ +ATTR ulong \ +atomic_fetch_sub(volatile A atomic_ulong *p, long v) \ +{ \ + return __opencl_atomic_fetch_sub((VOLATILE A atomic_ulong *)p, (ulong)v, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR ulong \ +atomic_fetch_sub_explicit(volatile A atomic_ulong *p, long v, memory_order o) \ +{ \ + return __opencl_atomic_fetch_sub((VOLATILE A atomic_ulong *)p, (ulong)v, o, memory_scope_device); \ +} \ + \ +ATTR ulong \ +atomic_fetch_sub_explicit(volatile A atomic_ulong *p, long v, memory_order o, memory_scope s) \ +{ \ + return __opencl_atomic_fetch_sub((VOLATILE A atomic_ulong *)p, (ulong)v, o, s); \ } -ATTR ulong -atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s) -{ - return __opencl_atomic_fetch_sub((VOLATILE atomic_ulong *)p, (ulong)v, o, s); -} +#if defined EXPLICIT_ASPACES +UIP(__global) +UIP(__local) +#endif +UIP() // flag functions -ATTR bool -atomic_flag_test_and_set(volatile atomic_flag *p) -{ - return __opencl_atomic_exchange((VOLATILE atomic_int *)p, 1, memory_order_seq_cst, memory_scope_device); -} - -ATTR bool -atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o) -{ - return __opencl_atomic_exchange((VOLATILE atomic_int *)p, 1, o, memory_scope_device); -} - -ATTR bool -atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o, memory_scope s) -{ - return __opencl_atomic_exchange((VOLATILE atomic_int *)p, 1, o, s); -} - -ATTR void -atomic_flag_clear(volatile atomic_flag *p) -{ - __opencl_atomic_store((VOLATILE atomic_int *)p, 0, memory_order_seq_cst, 
memory_scope_device); -} - -ATTR void -atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o) -{ - __opencl_atomic_store((VOLATILE atomic_int *)p, 0, o, memory_scope_device); -} +#define FLG(A) \ +ATTR bool \ +atomic_flag_test_and_set(volatile A atomic_flag *p) \ +{ \ + return __opencl_atomic_exchange((VOLATILE A atomic_int *)p, 1, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR bool \ +atomic_flag_test_and_set_explicit(volatile A atomic_flag *p, memory_order o) \ +{ \ + return __opencl_atomic_exchange((VOLATILE A atomic_int *)p, 1, o, memory_scope_device); \ +} \ + \ +ATTR bool \ +atomic_flag_test_and_set_explicit(volatile A atomic_flag *p, memory_order o, memory_scope s) \ +{ \ + return __opencl_atomic_exchange((VOLATILE A atomic_int *)p, 1, o, s); \ +} \ + \ +ATTR void \ +atomic_flag_clear(volatile A atomic_flag *p) \ +{ \ + __opencl_atomic_store((VOLATILE A atomic_int *)p, 0, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR void \ +atomic_flag_clear_explicit(volatile A atomic_flag *p, memory_order o) \ +{ \ + __opencl_atomic_store((VOLATILE A atomic_int *)p, 0, o, memory_scope_device); \ +} \ + \ +ATTR void \ +atomic_flag_clear_explicit(volatile A atomic_flag *p, memory_order o, memory_scope s) \ +{ \ + __opencl_atomic_store((VOLATILE A atomic_int *)p, 0, o, s); \ +} \ -ATTR void -atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o, memory_scope s) -{ - __opencl_atomic_store((VOLATILE atomic_int *)p, 0, o, s); -} +#if defined EXPLICIT_ASPACES +FLG(__global) +FLG(__local) +#endif +FLG()