Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -45,9 +45,7 @@ } EXTERN int omp_get_num_threads(void) { - bool isSPMDExecutionMode = isSPMDMode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode); + int rc = GetNumberOfOmpThreads(isSPMDMode()); PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); return rc; } @@ -156,10 +154,7 @@ int rc = -1; // If level is 0 or all parallel regions are not active - return 0. unsigned parLevel = parallelLevel[GetWarpId()]; - if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel)) { - rc = 0; - } else if (level > 0) { + if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { int totLevel = omp_get_level(); if (level <= totLevel) { omptarget_nvptx_TaskDescr *currTaskDescr = @@ -179,8 +174,7 @@ (currTaskDescr->IsParallelConstruct() ? "par" : "task"), (int)currTaskDescr->InParallelRegion(), (int)sched, currTaskDescr->RuntimeChunkSize(), - (int)currTaskDescr->ThreadId(), - (int)currTaskDescr->ThreadsInTeam(), + (int)currTaskDescr->ThreadId(), (int)threadsInTeam, (int)currTaskDescr->NThreads()); } @@ -196,6 +190,12 @@ } while (currTaskDescr); ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); } + } else if (level == 0 || + (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && + level <= parLevel) || + (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && + level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { + rc = 0; } PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, rc) @@ -208,30 +208,14 @@ int rc = -1; unsigned parLevel = parallelLevel[GetWarpId()]; // If level is 0 or all parallel regions are not active - return 1. - if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel)) { + if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { + rc = threadsInTeam; + } else if (level == 0 || + (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && + level <= parLevel) || + (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && + level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { rc = 1; - } else if (level > 0) { - int totLevel = omp_get_level(); - if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int steps = totLevel - level; - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (currTaskDescr->IsParallelConstruct()) { - if (!steps) { - // found the level - rc = currTaskDescr->ThreadsInTeam(); - break; - } - steps--; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); - ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); - } } PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) return rc; Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/loop.cu +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -99,12 +99,9 @@ // When IsRuntimeUninitialized is true, we assume that the caller is // in an L0 parallel region and that all worker threads participate. - int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode); - // Assume we are in teams region or that we use a single block // per target region - ST numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, IsSPMDExecutionMode); + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); // All warps that are in excess of the maximum requested, do // not execute the loop @@ -212,7 +209,7 @@ } int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); - T tnum = currTaskDescr->ThreadsInTeam(); + T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); T tripCount = ub - lb + 1; // +1 because ub is inclusive ASSERT0(LT_FUSSY, threadId < tnum, "current thread is not needed here; error"); @@ -455,7 +452,7 @@ // automatically selects thread or warp ID based on selected implementation int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)), + ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), "current thread is not needed here; error"); // retrieve schedule kmp_sched_t schedule = @@ -509,7 +506,7 @@ PRINT(LD_LOOP, "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " "last %d\n", - (int)GetNumberOfOmpThreads(tid, isSPMDMode()), + (int)GetNumberOfOmpThreads(isSPMDMode()), (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper, (long long)*pstride, (int)*plast); return DISPATCH_NOTFINISHED; @@ -782,8 +779,7 @@ "Expected non-SPMD mode + initialized runtime."); omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc)); + uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); for (unsigned i = 0; i < varNum; i++) { // Reset buffer. Index: libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -33,6 +33,7 @@ __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; __device__ __shared__ uint16_t threadLimit; +__device__ __shared__ uint16_t threadsInTeam; // Pointer to this team's OpenMP state object __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -166,7 +166,6 @@ // methods for other fields INLINE uint16_t &NThreads() { return items.nthreads; } INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { @@ -174,14 +173,12 @@ } // init & copy INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode); - INLINE void InitLevelOneTaskDescr(uint16_t tnum, - omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, - uint16_t tnum); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum); @@ -213,7 +210,6 @@ uint8_t unused; uint16_t nthreads; // thread num for subsequent parallel regions uint16_t threadId; // thread id - uint16_t threadsInTeam; // threads in current team uint64_t runtimeChunkSize; // runtime chunk size } items; omptarget_nvptx_TaskDescr *prev; @@ -407,6 +403,7 @@ extern __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; extern __device__ __shared__ uint16_t threadLimit; +extern __device__ __shared__ uint16_t threadsInTeam; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -137,8 +137,7 @@ omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, - currTeamDescr.LevelZeroTaskDescr()); + newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr()); // install new top descriptor omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); @@ -147,7 +146,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam()); + (int)newTaskDescr->ThreadId(), (int)ThreadLimit); if (RequiresDataSharing && GetLaneId() == 0) { // Warp master innitializes data sharing environment. Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -42,14 +42,13 @@ items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode); ; // threads: whatever was alloc by kernel items.threadId = 0; // is master - items.threadsInTeam = 1; // sequential items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 } // This is called when all threads are started together in SPMD mode. // OMP directives include target parallel, target distribute parallel for, etc. INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( - uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) { + omptarget_nvptx_TaskDescr *parentTaskDescr) { // slow method // flag: // default sched is static, @@ -61,7 +60,6 @@ items.nthreads = 0; // # threads for subsequent parallel region items.threadId = GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) - items.threadsInTeam = tnum; items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 prev = parentTaskDescr; } @@ -91,12 +89,11 @@ } INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( - omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) { + omptarget_nvptx_TaskDescr *masterTaskDescr) { CopyParent(masterTaskDescr); // overrwrite specific items; items.flags |= TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.threadsInTeam = tnum; // set number of threads } INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( @@ -121,7 +118,6 @@ omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { CopyParent(parentTaskDescr); items.flags |= TaskDescr_InParL2P; // In L2+ parallelism - items.threadsInTeam = tnum; // set number of threads items.threadId = tid; } Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -264,7 +264,8 @@ // Set number of threads on work descriptor. omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); + threadsInTeam = NumThreads; } // All workers call this function. Deactivate those not needed. @@ -294,7 +295,7 @@ // Set to true for workers participating in the parallel region. bool isActive = false; // Initialize state for active threads. - if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) { + if (threadId < threadsInTeam) { // init work descriptor from workdesccr omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); @@ -310,7 +311,7 @@ (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads()); isActive = true; - IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1); + IncParallelLevel(threadsInTeam != 1); } return isActive; @@ -328,7 +329,7 @@ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); - DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1); + DecParallelLevel(threadsInTeam != 1); } //////////////////////////////////////////////////////////////////////////////// @@ -367,7 +368,6 @@ // - each thread becomes ID 0 in its serialized parallel, and // - there is only one thread per team newTaskDescr->ThreadId() = 0; - newTaskDescr->ThreadsInTeam() = 1; // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -20,8 +20,7 @@ EXTERN int32_t __gpu_block_reduce() { bool isSPMDExecutionMode = isSPMDMode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode); + int nt = GetNumberOfOmpThreads(isSPMDExecutionMode); if (nt != blockDim.x) return 0; unsigned tnum = __ACTIVEMASK(); @@ -39,7 +38,7 @@ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); int numthread; if (currTaskDescr->IsParallelConstruct()) { - numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc)); + numthread = GetNumberOfOmpThreads(checkSPMDMode(loc)); } else { numthread = GetNumberOfOmpTeams(); } @@ -147,8 +146,7 @@ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - uint32_t NumThreads = - GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode); + uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); if (NumThreads == 1) return 1; /* @@ -279,9 +277,8 @@ // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. uint32_t NumThreads = - isSPMDExecutionMode - ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; + isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) + : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); __shared__ volatile bool IsLastTeam; @@ -473,9 +470,8 @@ // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. uint32_t NumThreads = - checkSPMDMode(loc) - ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; + checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) + : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); __shared__ unsigned Bound; Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -54,8 +54,7 @@ INLINE int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -INLINE int GetNumberOfOmpThreads(int threadId, - bool isSPMDExecutionMode); // omp_num_threads +INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads INLINE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -165,18 +165,16 @@ return rc; } -INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) { +INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { // omp_num_threads int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + int Level = parallelLevel[GetWarpId()]; + if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { rc = 1; } else if (isSPMDExecutionMode) { rc = GetNumberOfThreadsInBlock(); } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - rc = currTaskDescr->ThreadsInTeam(); + rc = threadsInTeam; } return rc; Index: libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/sync.cu +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -46,10 +46,8 @@ __kmpc_barrier_simple_spmd(loc_ref, tid); } else { tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref)); + GetNumberOfOmpThreads(checkSPMDMode(loc_ref)); if (numberOfActiveOMPThreads > 1) { if (checkSPMDMode(loc_ref)) { __kmpc_barrier_simple_spmd(loc_ref, tid);