Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -165,7 +165,7 @@ ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode only with uninitialized runtime."); // parallelLevel starts from 0, need to add 1 for correct level. - return parallelLevel + 1; + return parallelLevel[GetWarpId()] + 1; } int level = 0; omptarget_nvptx_TaskDescr *currTaskDescr = Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -31,7 +31,7 @@ __device__ __shared__ uint32_t usedMemIdx; __device__ __shared__ uint32_t usedSlotIdx; -__device__ __shared__ uint8_t parallelLevel; +__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; // Pointer to this team's OpenMP state object __device__ __shared__ Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -406,7 +406,8 @@ omptarget_nvptx_simpleMemoryManager; extern __device__ __shared__ uint32_t usedMemIdx; extern __device__ __shared__ uint32_t usedSlotIdx; -extern __device__ __shared__ uint8_t parallelLevel; +extern __device__ __shared__ uint8_t + parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -95,8 +95,10 @@ // If OMP runtime is not required don't initialize OMP state. setExecutionParameters(Spmd, RuntimeUninitialized); if (GetThreadIdInBlock() == 0) { - parallelLevel = 0; usedSlotIdx = smid() % MAX_SM; + parallelLevel[0] = 0; + } else if (GetLaneId() == 0) { + parallelLevel[GetWarpId()] = 0; } __SYNCTHREADS(); return; Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -339,10 +339,12 @@ if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - __SYNCTHREADS(); - if (GetThreadIdInBlock() == 0) - ++parallelLevel; - __SYNCTHREADS(); + unsigned tnum = __ACTIVEMASK(); + int leader = __ffs(tnum) - 1; + __SHFL_SYNC(tnum, leader, leader); + if (GetLaneId() == leader) + ++parallelLevel[GetWarpId()]; + __SHFL_SYNC(tnum, leader, leader); return; } @@ -382,10 +384,12 @@ if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - __SYNCTHREADS(); - if (GetThreadIdInBlock() == 0) - --parallelLevel; - __SYNCTHREADS(); + unsigned tnum = __ACTIVEMASK(); + int leader = __ffs(tnum) - 1; + __SHFL_SYNC(tnum, leader, leader); + if (GetLaneId() == leader) + --parallelLevel[GetWarpId()]; + __SHFL_SYNC(tnum, leader, leader); return; } @@ -407,7 +411,7 @@ if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - return parallelLevel + 1; + return parallelLevel[GetWarpId()] + 1; } int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -40,6 +40,8 @@ INLINE int GetBlockIdInKernel(); INLINE int GetNumberOfBlocksInKernel(); INLINE int GetNumberOfThreadsInBlock(); +INLINE unsigned GetWarpId(); +INLINE unsigned GetLaneId(); // get global ids to locate tread/team info (constant regardless of OMP) INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -102,6 +102,10 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } +INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; } + +INLINE unsigned GetLaneId() { return threadIdx.x % WARPSIZE; } + //////////////////////////////////////////////////////////////////////////////// // // Calls to the Generic Scheme Implementation Layer (assuming 1D layout) @@ -154,7 +158,7 @@ ASSERT0(LT_FUSSY, isSPMDExecutionMode, "Uninitialized runtime with non-SPMD mode."); // For level 2 parallelism all parallel regions are executed sequentially. - if (parallelLevel > 0) + if (parallelLevel[GetWarpId()] > 0) rc = 0; else rc = GetThreadIdInBlock(); @@ -175,7 +179,7 @@ ASSERT0(LT_FUSSY, isSPMDExecutionMode, "Uninitialized runtime with non-SPMD mode."); // For level 2 parallelism all parallel regions are executed sequentially. - if (parallelLevel > 0) + if (parallelLevel[GetWarpId()] > 0) rc = 1; else rc = GetNumberOfThreadsInBlock(); Index: libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp =================================================================== --- libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp @@ -6,9 +6,10 @@ int main(void) { int isHost = -1; int ParallelLevel1 = -1, ParallelLevel2 = -1; + int Count = 0; #pragma omp target parallel for map(tofrom \ - : isHost, ParallelLevel1, ParallelLevel2) + : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) for (int J = 0; J < 10; ++J) { #pragma omp critical { @@ -18,12 +19,18 @@ ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) ? omp_get_level() : 2; } - int L2; -#pragma omp parallel for schedule(dynamic) lastprivate(L2) - for (int I = 0; I < 10; ++I) - L2 = omp_get_level(); + if (omp_get_thread_num() > 5) { + int L2; +#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) + for (int I = 0; I < 10; ++I) { + L2 = omp_get_level(); + Count += omp_get_level(); // (9-5)*10*2 = 80 + } #pragma omp critical - ParallelLevel2 = (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : 1; + ParallelLevel2 = (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : 1; + } else { + Count += omp_get_level(); // 6 * 1 = 6 + } } if (isHost < 0) { @@ -35,6 +42,10 @@ // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, ParallelLevel2); + // Final result of Count is (9-5)(num of loops)*10(num of iterations)*2(par + // level) + 6(num of iterations) * 1(par level) + // CHECK: Expected count = 86 + printf("Expected count = %d\n", Count); return isHost; }