Patch summary (free text ahead of the first "diff --git" header).

Touches openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu only:
  * moves the "target_impl.h" include after the "common/..." includes;
  * __kmpc_impl_activemask: __activemask() becomes inline PTX
    "activemask.b32"; the pre-9.0 path uses __nvvm_vote_ballot(1);
  * __kmpc_impl_shfl_sync / __kmpc_impl_shfl_down_sync: the __shfl* calls
    become __nvvm_shfl_* builtins; the trailing operand is WARPSIZE - 1 for
    the idx form and ((WARPSIZE - Width) << 8) | 0x1f for the down form;
  * __kmpc_impl_syncthreads: drops the clang < 9 "bar.sync" fallback and
    always calls __syncthreads();
  * __kmpc_impl_syncwarp: __syncwarp(Mask) becomes __nvvm_bar_warp_sync(Mask);
  * __kmpc_impl_set_lock: clock()/clock_t become
    __nvvm_read_ptx_sreg_clock()/int32_t.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy, assuming 2-space, 80-column formatting. The hunk
line counts were re-checked against every @@ header, but exact leading
whitespace on context lines should be confirmed against the target file (or
apply with whitespace-insensitive matching).
NOTE(review): the header name on the system include in the first hunk was
missing in the collapsed copy; <cuda.h> is assumed -- TODO confirm.
NOTE(review): in __kmpc_impl_set_lock the new int32_t "cycles" expression
mixes signed operands with the unsigned literal 0xffffffff; confirm the
wrap-around behaviour is what is intended.

diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -10,9 +10,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "target_impl.h"
 #include "common/debug.h"
 #include "common/target_atomic.h"
+#include "target_impl.h"
 
 #include <cuda.h>
 
@@ -56,23 +56,23 @@
 }
 
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 #if CUDA_VERSION >= 9000
-  return __activemask();
+  uint32_t mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
 #else
-  return __ballot(1);
+  return __nvvm_vote_ballot(1);
 #endif
 }
 
 // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
 #if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WARPSIZE - 1);
 #else
-  return __shfl(Var, SrcLane);
+  return __nvvm_shfl_idx_i32(Var, SrcLane, WARPSIZE - 1);
 #endif // CUDA_VERSION
 }
 
@@ -80,24 +80,18 @@
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
 #if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta,
+                                   ((WARPSIZE - Width) << 8) | 0x1f);
 #else
-  return __shfl_down(Var, Delta, Width);
+  return __nvvm_shfl_down_i32(Var, Delta, ((WARPSIZE - Width) << 8) | 0x1f);
 #endif // CUDA_VERSION
 }
 
-DEVICE void __kmpc_impl_syncthreads() {
-  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
-#if !defined(__clang__) || __clang_major__ >= 9
-  __syncthreads();
-#else
-  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
-#endif // __clang__
-}
+DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
+  __nvvm_bar_warp_sync(Mask);
 #else
   // In Cuda < 9.0 no need to sync threads in warps.
 #endif // CUDA_VERSION
@@ -145,11 +139,11 @@
 DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // TODO: not sure spinning is a good idea here..
   while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
-    clock_t start = clock();
-    clock_t now;
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
     for (;;) {
-      now = clock();
-      clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
       if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
         break;
       }