Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -55,11 +55,22 @@
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down_sync((mask), (var), (delta), (width))
 #define __ACTIVEMASK() __activemask()
+// Warp-level barrier over the lanes named in Mask. Pass the mask captured
+// with __ACTIVEMASK() so that only participating lanes are waited for; the
+// default full-warp mask would also name lanes that never reach the barrier
+// when the warp is divergent.
+// For .target sm_6x or below, all threads in mask must execute the same
+// __syncwarp() in convergence, and the union of all values in mask must be
+// equal to the active mask. Otherwise, the behavior is undefined.
+// (https://docs.nvidia.com/cuda/archive/9.0/cuda-c-programming-guide/index.html#synchronization-functions)
+#define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down((var), (delta), (width))
 #define __ACTIVEMASK() __ballot(1)
+// In CUDA < 9.0 there is no need to sync threads in warps.
+#define __SYNCWARP(Mask)
 #endif // CUDA_VERSION
 
 #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
Index: libomptarget/deviceRTLs/nvptx/src/supporti.h
===================================================================
--- libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -202,25 +202,43 @@
 // Parallel level
 
+// Adds 1, plus OMP_ACTIVE_PARALLEL_LEVEL when ActiveParallel is set, to the
+// parallelLevel entry indexed by GetWarpId(). Only the lowest active lane writes.
 INLINE void IncParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  // Capture the participating lanes once so that both barriers and the
+  // writer election below agree on the same set.
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  // Number of active lanes below this one; 0 identifies the lowest one.
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] +=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    // Order the write ahead of the barrier that releases the other lanes.
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
 }
 
+// Subtracts 1, plus OMP_ACTIVE_PARALLEL_LEVEL when ActiveParallel is set, from
+// the parallelLevel entry indexed by GetWarpId(). Only the lowest active lane writes.
 INLINE void DecParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  // Capture the participating lanes once so that both barriers and the
+  // writer election below agree on the same set.
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  // Number of active lanes below this one; 0 identifies the lowest one.
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] -=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    // Order the write ahead of the barrier that releases the other lanes.
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
 }
 
 ////////////////////////////////////////////////////////////////////////////////