diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -28,9 +28,6 @@ int __shfl_down(int var, unsigned detla, int width); int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); void __syncwarp(int mask); -void __threadfence(); -void __threadfence_block(); -void __threadfence_system(); } DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { @@ -126,9 +123,9 @@ : "memory"); } -DEVICE void __kmpc_impl_threadfence() { __threadfence(); } -DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); } -DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); } +DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } +DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } +DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } // Calls to the NVPTX layer (assuming 1D layout) DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } @@ -140,39 +137,41 @@ DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } -// Forward declaration of atomics. Although they're template functions, we -// already have definitions for different types in CUDA internal headers with -// the right mangled names. 
-template <typename T> DEVICE T atomicAdd(T *address, T val); -template <typename T> DEVICE T atomicInc(T *address, T val); -template <typename T> DEVICE T atomicMax(T *address, T val); -template <typename T> DEVICE T atomicExch(T *address, T val); -template <typename T> DEVICE T atomicCAS(T *address, T compare, T val); - +// Atomics DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { - return atomicAdd(Address, Val); + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { - return atomicInc(Address, Val); + return __nvvm_atom_inc_gen_ui(Address, Val); } + DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { - return atomicMax(Address, Val); + return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); } + DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { - return atomicExch(Address, Val); + uint32_t R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; } + DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) { - return atomicCAS(Address, Compare, Val); + (void)__atomic_compare_exchange(Address, &Compare, &Val, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + return Compare; } DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, unsigned long long Val) { - return atomicExch(Address, Val); + unsigned long long R; + __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); + return R; } + DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, unsigned long long Val) { - return atomicAdd(Address, Val); + return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); } #define __OMP_SPIN 1000