Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -55,11 +55,14 @@ #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down_sync((mask), (var), (delta), (width)) #define __ACTIVEMASK() __activemask() +#define __SYNCWARP(Mask) __syncwarp(Mask) #else #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ __shfl_down((var), (delta), (width)) #define __ACTIVEMASK() __ballot(1) +// In Cuda < 9.0 no need to sync threads in warps. +#define __SYNCWARP(Mask) #endif // CUDA_VERSION #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory"); Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -202,25 +202,31 @@ // Parallel level INLINE void IncParallelLevel(bool ActiveParallel) { - unsigned tnum = __ACTIVEMASK(); - int leader = __ffs(tnum) - 1; - __SHFL_SYNC(tnum, leader, leader); - if (GetLaneId() == leader) { + unsigned Active = __ACTIVEMASK(); + __SYNCWARP(Active); + unsigned LaneMaskLt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); + unsigned Rank = __popc(Active & LaneMaskLt); + if (Rank == 0) { parallelLevel[GetWarpId()] += (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + __threadfence(); } - __SHFL_SYNC(tnum, leader, leader); + __SYNCWARP(Active); } INLINE void DecParallelLevel(bool ActiveParallel) { - unsigned tnum = __ACTIVEMASK(); - int leader = __ffs(tnum) - 1; - __SHFL_SYNC(tnum, leader, leader); - if (GetLaneId() == leader) { + unsigned Active = __ACTIVEMASK(); + __SYNCWARP(Active); + unsigned LaneMaskLt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); + unsigned Rank = __popc(Active & LaneMaskLt); + if (Rank == 0) { parallelLevel[GetWarpId()] -= (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + __threadfence(); } - __SHFL_SYNC(tnum, leader, leader); + __SYNCWARP(Active); } ////////////////////////////////////////////////////////////////////////////////