Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -37,7 +37,7 @@ PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); if (num <= 0) { WARNING0(LW_INPUT, "expected positive num; ignore\n"); - } else if (parallelLevel[GetWarpId()] == 0) { + } else if (getParallelLevel(GetWarpId()) == 0) { nThreads = num; } } @@ -49,12 +49,13 @@ } EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) + int parLevel = getParallelLevel(GetWarpId()); + if (parLevel > 0) // We're already in parallel region. return 1; // default is 1 thread avail // Not currently in a parallel region, return what was set. int rc = 1; - if (parallelLevel[GetWarpId()] == 0) + if (parLevel == 0) rc = nThreads; ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); @@ -84,7 +85,7 @@ } EXTERN int omp_in_parallel(void) { - int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + int rc = getParallelLevel(GetWarpId()) > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); return rc; } @@ -133,13 +134,13 @@ } EXTERN int omp_get_level(void) { - int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + int level = getParallelLevel(GetWarpId()) & (OMP_ACTIVE_PARALLEL_LEVEL - 1); PRINT(LD_IO, "call omp_get_level() returns %d\n", level); return level; } EXTERN int omp_get_active_level(void) { - int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + int level = getParallelLevel(GetWarpId()) > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) return level; } @@ -149,7 +150,7 @@ return level == 1 ? GetThreadIdInBlock() : 0; int rc = -1; // If level is 0 or all parallel regions are not active - return 0. - unsigned parLevel = parallelLevel[GetWarpId()]; + int parLevel = getParallelLevel(GetWarpId()); if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { int totLevel = omp_get_level(); if (level <= totLevel) { @@ -202,7 +203,7 @@ if (isSPMDMode()) return level == 1 ? GetNumberOfThreadsInBlock() : 1; int rc = -1; - unsigned parLevel = parallelLevel[GetWarpId()]; + int parLevel = getParallelLevel(GetWarpId()); // If level is 0 or all parallel regions are not active - return 1. if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { rc = threadsInTeam; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -44,7 +44,7 @@ "Generic always requires initialized runtime."); setExecutionParameters(Generic, RuntimeInitialized); for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) - parallelLevel[I] = 0; + setParallelLevel(I, 0); int threadIdInBlock = GetThreadIdInBlock(); ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), @@ -98,11 +98,13 @@ int threadId = GetThreadIdInBlock(); if (threadId == 0) { usedSlotIdx = smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + setParallelLevel( + 0, + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + setParallelLevel( + GetWarpId(), + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); } if (!RequiresOMPRuntime) { // Runtime is not required - exit. Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -401,7 +401,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + return getParallelLevel(GetWarpId()) & (OMP_ACTIVE_PARALLEL_LEVEL - 1); } // This kmpc call returns the thread id across all teams. It's value is Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -31,6 +31,10 @@ INLINE bool isRuntimeUninitialized(); INLINE bool isRuntimeInitialized(); +INLINE int getParallelLevel(int WarpId); +INLINE void setParallelLevel(int WarpId, int Val); +INLINE void changeParallelLevel(int WarpId, int Val, bool IsIncrement); + //////////////////////////////////////////////////////////////////////////////// // get info from machine //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -84,6 +84,77 @@ return !checkRuntimeUninitialized(loc); } +INLINE int getParallelLevel(int WarpId) { + // Use volatile access in case of CUDA8 to prevent dangerous optimizations + // performed by ptxas. + int32_t parLevel; +#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000 + asm volatile("ld.volatile.u8 %0, [%1];" + : "=r"(parLevel) + : +#ifdef __LP64__ + "l" +#else // __LP64__ + "r" +#endif // __LP64__ + (¶llelLevel[WarpId]) + : "memory"); +#else + parLevel = parallelLevel[WarpId]; +#endif // CUDA_VERSION + return parLevel; +} + +INLINE void setParallelLevel(int WarpId, int Val) { + uint8_t &parLevel = parallelLevel[WarpId]; +#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000 + asm volatile("st.volatile.u8 [%1], %0;" ::"r"(Val), +#ifdef __LP64__ + "l" +#else // __LP64__ + "r" +#endif // __LP64__ + (&parLevel) + : "memory"); +#else // CUDA_VERSION + parLevel = Val; +#endif // CUDA_VERSION +} + +INLINE void changeParallelLevel(int WarpId, int Val, bool IsIncrement) { + uint8_t &parLevel = parallelLevel[WarpId]; +#if defined(CUDA_VERSION) && CUDA_VERSION <= 8000 + int CurVal; + asm volatile("ld.volatile.u8 %0, [%1];" + : "=r"(CurVal) + : +#ifdef __LP64__ + "l" +#else // __LP64__ + "r" +#endif // __LP64__ + (&parLevel) + : "memory"); + if (IsIncrement) + CurVal += Val; + else + CurVal -= Val; + asm volatile("st.volatile.u8 [%1], %0;" ::"r"(CurVal), +#ifdef __LP64__ + "l" +#else // __LP64__ + "r" +#endif // __LP64__ + (&parLevel) + : "memory"); +#else // CUDA_VERSION + if (IsIncrement) + parLevel += Val; + else + parLevel -= Val; +#endif // CUDA_VERSION +} + //////////////////////////////////////////////////////////////////////////////// // support: get info from machine //////////////////////////////////////////////////////////////////////////////// @@ -152,7 +223,8 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { // omp_thread_num int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + int Level = getParallelLevel(GetWarpId()); + if ((Level & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { rc = 0; } else if (isSPMDExecutionMode) { rc = GetThreadIdInBlock(); @@ -168,7 +240,7 @@ INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { // omp_num_threads int rc; - int Level = parallelLevel[GetWarpId()]; + int Level = getParallelLevel(GetWarpId()); if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { rc = 1; } else if (isSPMDExecutionMode) { @@ -206,8 +278,9 @@ int leader = __ffs(tnum) - 1; __SHFL_SYNC(tnum, leader, leader); if (GetLaneId() == leader) { - parallelLevel[GetWarpId()] += - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + changeParallelLevel(GetWarpId(), + 1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0), + /*IsIncrement=*/true); } __SHFL_SYNC(tnum, leader, leader); } @@ -217,8 +290,9 @@ int leader = __ffs(tnum) - 1; __SHFL_SYNC(tnum, leader, leader); if (GetLaneId() == leader) { - parallelLevel[GetWarpId()] -= - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + changeParallelLevel(GetWarpId(), + 1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0), + /*IsIncrement=*/false); } __SHFL_SYNC(tnum, leader, leader); } Index: libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp =================================================================== --- libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp @@ -47,5 +47,15 @@ // CHECK: Expected count = 86 printf("Expected count = %d\n", Count); + Count = 0; +#pragma omp target parallel for reduction(+: Count) schedule(dynamic, 2) num_threads(64) + for (int J = 0; J < 1000; ++J) { + Count += J; + } + + // Final result of Count is 1000 * (999-0) / 2 + // CHECK: Expected count with dynamic scheduling = 499500 + printf("Expected count with dynamic scheduling = %d\n", Count); + return isHost; }