Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -311,7 +311,16 @@ (int)newTaskDescr->ThreadId(), (int)nThreads); isActive = true; - IncParallelLevel(threadsInTeam != 1); + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level + // can be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + IncParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 0xFFFFFFFF : 1u); } return isActive; @@ -329,7 +338,16 @@ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); - DecParallelLevel(threadsInTeam != 1); + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level can + // be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + DecParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 0xFFFFFFFF : 1u); } //////////////////////////////////////////////////////////////////////////////// @@ -339,7 +357,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - IncParallelLevel(/*ActiveParallel=*/false); + IncParallelLevel(/*ActiveParallel=*/false, __ACTIVEMASK()); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), @@ -378,7 +396,7 @@ uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - DecParallelLevel(/*ActiveParallel=*/false); + DecParallelLevel(/*ActiveParallel=*/false, __ACTIVEMASK()); if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -65,8 +65,8 @@ INLINE int IsTeamMaster(int ompThreadId); // Parallel level -INLINE void IncParallelLevel(bool ActiveParallel); -INLINE void DecParallelLevel(bool ActiveParallel); +INLINE void IncParallelLevel(bool ActiveParallel, unsigned Mask); +INLINE void DecParallelLevel(bool ActiveParallel, unsigned Mask); //////////////////////////////////////////////////////////////////////////////// // Memory Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -201,32 +201,30 @@ //////////////////////////////////////////////////////////////////////////////// // Parallel level -INLINE void IncParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); - __SYNCWARP(Active); +INLINE void IncParallelLevel(bool ActiveParallel, unsigned Mask) { + __SYNCWARP(Mask); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); - unsigned Rank = __popc(Active & LaneMaskLt); + unsigned Rank = __popc(Mask & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] += (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __SYNCWARP(Active); + __SYNCWARP(Mask); } -INLINE void DecParallelLevel(bool ActiveParallel) { - unsigned Active = __ACTIVEMASK(); - __SYNCWARP(Active); +INLINE void DecParallelLevel(bool ActiveParallel, unsigned Mask) { + __SYNCWARP(Mask); unsigned LaneMaskLt; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt)); - unsigned Rank = __popc(Active & LaneMaskLt); + unsigned Rank = __popc(Mask & LaneMaskLt); if (Rank == 0) { parallelLevel[GetWarpId()] -= (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); __threadfence(); } - __SYNCWARP(Active); + __SYNCWARP(Mask); } //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/test/parallel/level.c =================================================================== --- libomptarget/deviceRTLs/nvptx/test/parallel/level.c +++ libomptarget/deviceRTLs/nvptx/test/parallel/level.c @@ -135,5 +135,17 @@ } } + // Check for paraller level in non-SPMD kernels. + level = 0; + #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) + for (int i=0; i<5032; i+=32) { + int ub = (i+32 > 5032) ? 5032 : i+32; + #pragma omp parallel for schedule(dynamic) + for (int j=i ; j < ub; j++) ; + level += omp_get_level(); + } + // CHECK: Integral level = 0. + printf("Integral level = %d.\n", level); + return 0; }