diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -126,29 +126,17 @@ DEVICE unsigned GetLaneId(); // Atomics -template INLINE T __kmpc_atomic_add(T *address, T val) { - return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); -} - -INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) { - return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, ""); -} - -template INLINE T __kmpc_atomic_max(T *address, T val) { - return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); -} - -template INLINE T __kmpc_atomic_exchange(T *address, T val) { - T r; - __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); - return r; -} - -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { - (void)__atomic_compare_exchange(address, &compare, &val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return compare; -} +DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); + +static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, + unsigned long long); // Locks DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -132,11 +132,13 @@ } // namespace DEVICE int GetNumberOfBlocksInKernel() { - return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); + return get_grid_dim(__builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); } DEVICE int GetNumberOfThreadsInBlock() { - return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(), + return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x()); } @@ -149,6 +151,40 @@ return GetNumberOfThreadsInBlock(); } +// Atomics +DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); +} +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { + return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, ""); +} +DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { + return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); +} + +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { + uint32_t R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; +} +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, + uint32_t Val) { + (void)__atomic_compare_exchange(Address, &Compare, &Val, false, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return Compare; +} + +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, + unsigned long long Val) { + unsigned long long R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; +} +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, + unsigned long long Val) { + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); +} + // Stub implementations DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; } DEVICE void __kmpc_impl_free(void *) {} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -131,25 +131,17 @@ DEVICE unsigned GetLaneId(); // Atomics -template INLINE T __kmpc_atomic_add(T *address, T val) { - return atomicAdd(address, val); -} - -template INLINE T __kmpc_atomic_inc(T *address, T val) { - return atomicInc(address, val); -} - -template INLINE T __kmpc_atomic_max(T *address, T val) { - return atomicMax(address, val); -} - -template INLINE T __kmpc_atomic_exchange(T *address, T val) { - return atomicExch(address, val); -} - -template INLINE T __kmpc_atomic_cas(T *address, T compare, T val) { - return atomicCAS(address, compare, val); -} +DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t); +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t); + +static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *, + unsigned long long); +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *, + unsigned long long); // Locks DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -123,6 +123,41 @@ DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } +// Forward declaration of atomics. Although they're template functions, we +// already have definitions for different types in CUDA internal headers with +// the right mangled names. +template DEVICE T atomicAdd(T *address, T val); +template DEVICE T atomicInc(T *address, T val); +template DEVICE T atomicMax(T *address, T val); +template DEVICE T atomicExch(T *address, T val); +template DEVICE T atomicCAS(T *address, T compare, T val); + +DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { + return atomicAdd(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { + return atomicInc(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { + return atomicMax(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { + return atomicExch(Address, Val); +} +DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, + uint32_t Val) { + return atomicCAS(Address, Compare, Val); +} + +DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, + unsigned long long Val) { + return atomicExch(Address, Val); +} +DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, + unsigned long long Val) { + return atomicAdd(Address, Val); +} + #define __OMP_SPIN 1000 #define UNSET 0u #define SET 1u