Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -30,17 +30,16 @@ } EXTERN void omp_set_num_threads(int num) { - // Ignore it for SPMD mode. - if (isSPMDMode()) + // Ignore it if the parallel region is started already, inner L2+ parallel + // regions can have only 1 thread maximum. + if (parallelLevel[GetWarpId()] > 0) return; ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); if (num <= 0) { WARNING0(LW_INPUT, "expected positive num; ignore\n"); } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - currTaskDescr->NThreads() = num; + numberOfThreads = num; } } @@ -56,12 +55,7 @@ if (parallelLevel[GetWarpId()] > 0) // We're already in parallel region. return 1; // default is 1 thread avail - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), - "Should no be in the parallel region"); - // Not currently in a parallel region, return what was set. - int rc = currTaskDescr->NThreads(); + int rc = numberOfThreads; ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); return rc; @@ -184,7 +178,7 @@ currTaskDescr->RuntimeChunkSize(), (int)currTaskDescr->ThreadId(), (int)currTaskDescr->ThreadsInTeam(), - (int)currTaskDescr->NThreads()); + (int)numberOfThreads); } if (currTaskDescr->IsParallelConstruct()) { Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -32,6 +32,7 @@ __device__ __shared__ uint32_t usedSlotIdx; __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +__device__ __shared__ uint16_t numberOfThreads; // Pointer to this team's OpenMP state object __device__ __shared__ Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -164,7 +164,6 @@ } INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } // methods for other fields - INLINE uint16_t &NThreads() { return items.nthreads; } INLINE uint16_t &ThreadLimit() { return items.threadlimit; } INLINE uint16_t &ThreadId() { return items.threadId; } INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } @@ -212,7 +211,6 @@ struct TaskDescr_items { uint8_t flags; // 6 bit used (see flag above) uint8_t unused; - uint16_t nthreads; // thread num for subsequent parallel regions uint16_t threadlimit; // thread limit ICV uint16_t threadId; // thread id uint16_t threadsInTeam; // threads in current team @@ -408,6 +406,7 @@ extern __device__ __shared__ uint32_t usedSlotIdx; extern __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern __device__ __shared__ uint16_t numberOfThreads; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -73,7 +73,7 @@ // set number of threads and thread limit in team to started value omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); + numberOfThreads = GetNumberOfWorkersInTeam(); currTaskDescr->ThreadLimit() = ThreadLimit; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -39,7 +39,6 @@ // not in parallel items.flags = 0; - items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode); ; // threads: whatever was alloc by kernel items.threadId = 0; // is master items.threadsInTeam = 1; // sequential @@ -58,7 +57,6 @@ items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.nthreads = 0; // # threads for subsequent parallel region items.threadId = GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) items.threadsInTeam = tnum; Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -249,9 +249,8 @@ uint16_t &NumThreadsClause = omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(), - currTaskDescr->ThreadLimit()); + uint16_t NumThreads = determineNumberOfThreads( + NumThreadsClause, numberOfThreads, currTaskDescr->ThreadLimit()); if (NumThreadsClause != 0) { // Reset request to avoid propagating to successive #parallel @@ -308,7 +307,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads()); + (int)newTaskDescr->ThreadId(), (int)numberOfThreads); isActive = true; IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1);