diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -55,14 +55,11 @@ #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down_sync((mask), (var), (delta), (width)) #define __ACTIVEMASK() __activemask() -#define __SYNCWARP(Mask) __syncwarp(Mask) #else #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down((var), (delta), (width)) #define __ACTIVEMASK() __ballot(1) -// In Cuda < 9.0 no need to sync threads in warps. -#define __SYNCWARP(Mask) #endif // CUDA_VERSION #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory"); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -14,6 +14,8 @@ // Execution Parameters //////////////////////////////////////////////////////////////////////////////// +#include "target_impl.h" + INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { execution_param = EMode; execution_param |= RMode; @@ -203,7 +205,7 @@ INLINE void IncParallelLevel(bool ActiveParallel) { unsigned Active = __ACTIVEMASK(); - __SYNCWARP(Active); + __kmpc_impl_syncwarp(Active); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); unsigned Rank = __popc(Active & LaneMaskLt); @@ -212,12 +214,12 @@ (1 + (ActiveParallel ? 
OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __SYNCWARP(Active); + __kmpc_impl_syncwarp(Active); } INLINE void DecParallelLevel(bool ActiveParallel) { unsigned Active = __ACTIVEMASK(); - __SYNCWARP(Active); + __kmpc_impl_syncwarp(Active); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); unsigned Rank = __popc(Active & LaneMaskLt); @@ -226,7 +228,7 @@ (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __SYNCWARP(Active); + __kmpc_impl_syncwarp(Active); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -38,6 +38,16 @@ INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); } -INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); } +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +#if CUDA_VERSION >= 9000 + __syncwarp(Mask); +#else + // CUDA < 9.0: no explicit warp synchronization is needed, so this is a no-op. +#endif +} #endif