Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -37,7 +37,7 @@ PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); if (num <= 0) { WARNING0(LW_INPUT, "expected positive num; ignore\n"); - } else if (parallelLevel[GetWarpId()] == 0) { + } else if (getParallelLevel(GetWarpId()) == 0) { nThreads = num; } } @@ -49,12 +49,13 @@ } EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) + int parLevel = getParallelLevel(GetWarpId()); + if (parLevel > 0) // We're already in parallel region. return 1; // default is 1 thread avail // Not currently in a parallel region, return what was set. int rc = 1; - if (parallelLevel[GetWarpId()] == 0) + if (parLevel == 0) rc = nThreads; ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); @@ -84,7 +85,7 @@ } EXTERN int omp_in_parallel(void) { - int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + int rc = getParallelLevel(GetWarpId()) > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); return rc; } @@ -133,13 +134,13 @@ } EXTERN int omp_get_level(void) { - int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + int level = getParallelLevel(GetWarpId()) & (OMP_ACTIVE_PARALLEL_LEVEL - 1); PRINT(LD_IO, "call omp_get_level() returns %d\n", level); return level; } EXTERN int omp_get_active_level(void) { - int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + int level = getParallelLevel(GetWarpId()) > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) return level; } @@ -149,7 +150,7 @@ return level == 1 ? GetThreadIdInBlock() : 0; int rc = -1; // If level is 0 or all parallel regions are not active - return 0. - unsigned parLevel = parallelLevel[GetWarpId()]; + int parLevel = getParallelLevel(GetWarpId()); if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { int totLevel = omp_get_level(); if (level <= totLevel) { @@ -202,7 +203,7 @@ if (isSPMDMode()) return level == 1 ? GetNumberOfThreadsInBlock() : 1; int rc = -1; - unsigned parLevel = parallelLevel[GetWarpId()]; + int parLevel = getParallelLevel(GetWarpId()); // If level is 0 or all parallel regions are not active - return 1. if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { rc = threadsInTeam; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -45,7 +45,7 @@ "Generic always requires initialized runtime."); setExecutionParameters(Generic, RuntimeInitialized); for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) - parallelLevel[I] = 0; + setParallelLevel(I, 0); int threadIdInBlock = GetThreadIdInBlock(); ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), @@ -99,11 +99,13 @@ int threadId = GetThreadIdInBlock(); if (threadId == 0) { usedSlotIdx = smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + setParallelLevel( + 0, + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + setParallelLevel( + GetWarpId(), + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); } if (!RequiresOMPRuntime) { // Runtime is not required - exit. Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -421,7 +421,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + return getParallelLevel(GetWarpId()) & (OMP_ACTIVE_PARALLEL_LEVEL - 1); } // This kmpc call returns the thread id across all teams. It's value is Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -32,6 +32,10 @@ INLINE bool isRuntimeUninitialized(); INLINE bool isRuntimeInitialized(); +INLINE int getParallelLevel(int WarpId); +INLINE void setParallelLevel(int WarpId, int Val); +INLINE void changeParallelLevel(int WarpId, int Val, bool IsIncrement); + //////////////////////////////////////////////////////////////////////////////// // get info from machine //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -86,6 +86,23 @@ return !checkRuntimeUninitialized(loc); } +INLINE int getParallelLevel(int WarpId) { + return __kmpc_impl_get_parallel_level(parallelLevel[WarpId]); +} + +INLINE void setParallelLevel(int WarpId, int Val) { + __kmpc_impl_set_parallel_level(parallelLevel[WarpId], Val); +} + +INLINE void changeParallelLevel(int WarpId, int Val, bool IsIncrement) { + int ParLevel = getParallelLevel(WarpId); + if (IsIncrement) + ParLevel += Val; + else + ParLevel -= Val; + setParallelLevel(WarpId, ParLevel); +} + //////////////////////////////////////////////////////////////////////////////// // support: get info from machine //////////////////////////////////////////////////////////////////////////////// @@ -154,7 +171,8 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { // omp_thread_num int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + int Level = getParallelLevel(GetWarpId()); + if ((Level & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { rc = 0; } else if (isSPMDExecutionMode) { rc = GetThreadIdInBlock(); @@ -170,7 +188,7 @@ INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { // omp_num_threads int rc; - int Level = parallelLevel[GetWarpId()]; + int Level = getParallelLevel(GetWarpId()); if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { rc = 1; } else if (isSPMDExecutionMode) { @@ -208,8 +226,9 @@ __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); if (Rank == 0) { - parallelLevel[GetWarpId()] += - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + changeParallelLevel(GetWarpId(), + 1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0), + /*IsIncrement=*/true); __threadfence(); } __kmpc_impl_syncwarp(Mask); @@ -220,8 +239,9 @@ __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); if (Rank == 0) { - parallelLevel[GetWarpId()] -= - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + changeParallelLevel(GetWarpId(), + 1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0), + /*IsIncrement=*/false); __threadfence(); } __kmpc_impl_syncwarp(Mask); Index: libomptarget/deviceRTLs/nvptx/src/target_impl.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -98,4 +98,38 @@ #endif // CUDA_VERSION } +#ifdef __LP64__ +#define PTR_CONSTRAINT "l" +#else // __LP64__ +#define PTR_CONSTRAINT "r" +#endif // __LP64__ + +INLINE int32_t __kmpc_impl_get_parallel_level(uint8_t &ParLevel) { + // Use volatile access in case of CUDA8 to prevent dangerous optimizations + // performed by ptxas. + int32_t ParLevelVal; +#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000 + asm volatile("ld.volatile.u8 %0, [%1];" + : "=r"(ParLevelVal) + : PTR_CONSTRAINT(&ParLevel) + : "memory"); +#else + ParLevelVal = ParLevel; +#endif // CUDA_VERSION + return ParLevelVal; +} + +INLINE void __kmpc_impl_set_parallel_level(uint8_t &ParLevel, + int32_t ParLevelVal) { +#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000 + asm volatile("st.volatile.u8 [%1], %0;" ::"r"(ParLevelVal), + PTR_CONSTRAINT(&ParLevel) + : "memory"); +#else // CUDA_VERSION + ParLevel = ParLevelVal; +#endif // CUDA_VERSION +} + +#undef PTR_CONSTRAINT + #endif Index: libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp =================================================================== --- libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp @@ -47,5 +47,15 @@ // CHECK: Expected count = 86 printf("Expected count = %d\n", Count); + Count = 0; +#pragma omp target parallel for reduction(+: Count) schedule(dynamic, 2) num_threads(64) + for (int J = 0; J < 1000; ++J) { + Count += J; + } + + // Final result of Count is 1000 * (999-0) / 2 + // CHECK: Expected count with dynamic scheduling = 499500 + printf("Expected count with dynamic scheduling = %d\n", Count); + return isHost; }