diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -16,20 +16,6 @@ #include -// Forward declaration of CUDA primitives which will be evetually transformed -// into LLVM intrinsics. -extern "C" { -unsigned int __activemask(); -unsigned int __ballot(unsigned); -// The default argument here is based on NVIDIA's website -// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ -int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE); -int __shfl(int val, int src_line, int width = WARPSIZE); -int __shfl_down(int var, unsigned detla, int width); -int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); -void __syncwarp(int mask); -} - DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); } @@ -71,10 +57,12 @@ // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); +#if CUDA_VERSION < 9020 + return __nvvm_vote_ballot(1); #else - return __ballot(1); + unsigned int Mask; + asm volatile("activemask.b32 %0;" : "=r"(Mask)); + return Mask; #endif } @@ -82,19 +70,20 @@ DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, int32_t SrcLane) { #if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); #else - return __shfl(Var, SrcLane); + return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f); #endif // CUDA_VERSION } DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { + int32_t T = ((WARPSIZE - Width) << 8) | 0x1f; #if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); #else - return __shfl_down(Var, Delta, Width); + return __nvvm_shfl_down_i32(Var, Delta, T); #endif // CUDA_VERSION } @@ -102,7 +91,7 @@ DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { #if CUDA_VERSION >= 9000 - __syncwarp(Mask); + __nvvm_bar_warp_sync(Mask); #else // In Cuda < 9.0 no need to sync threads in warps. #endif // CUDA_VERSION