Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -49,12 +49,13 @@ } EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) + unsigned parLevel = parallelLevel[GetWarpId()]; + if (parLevel > 0) // We're already in parallel region. return 1; // default is 1 thread avail // Not currently in a parallel region, return what was set. int rc = 1; - if (parallelLevel[GetWarpId()] == 0) + if (parLevel == 0) rc = nThreads; ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); @@ -70,9 +71,7 @@ } EXTERN int omp_get_thread_num() { - bool isSPMDExecutionMode = isSPMDMode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetOmpThreadId(tid, isSPMDExecutionMode); + int rc = GetOmpThreadId(isSPMDMode()); PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); return rc; } @@ -151,41 +150,7 @@ // If level is 0 or all parallel regions are not active - return 0. unsigned parLevel = parallelLevel[GetWarpId()]; if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - int totLevel = omp_get_level(); - if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int steps = totLevel - level; - PRINT(LD_IO, "backtrack %d steps\n", steps); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (DON(LD_IOD)) { - // print current state - omp_sched_t sched = currTaskDescr->GetRuntimeSched(); - PRINT(LD_ALL, - "task descr %s %d: %s, in par %d, rt sched %d," - " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", - "ancestor", steps, - (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - (int)currTaskDescr->InParallelRegion(), (int)sched, - currTaskDescr->RuntimeChunkSize(), - (int)currTaskDescr->ThreadId(), (int)threadsInTeam, - (int)nThreads); - } - - if (currTaskDescr->IsParallelConstruct()) { - // found the level - if (!steps) { - rc = currTaskDescr->ThreadId(); - break; - } - steps--; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); - ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); - } + rc = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); } else if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && level <= parLevel) || Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -169,7 +169,6 @@ } INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } // methods for other fields - INLINE uint16_t &ThreadId() { return items.threadId; } INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { @@ -186,8 +185,8 @@ INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum); - INLINE void SaveLoopData(); - INLINE void RestoreLoopData() const; + INLINE void SaveLoopData(int threadId); + INLINE void RestoreLoopData(int threadId) const; private: // bits for flags: (6 used, 2 free) @@ -212,7 +211,6 @@ struct TaskDescr_items { uint8_t flags; // 6 bit used (see flag above) uint8_t unused; - uint16_t threadId; // thread id uint64_t runtimeChunkSize; // runtime chunk size } items; omptarget_nvptx_TaskDescr *prev; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -46,8 +46,7 @@ for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) parallelLevel[I] = 0; - int threadIdInBlock = GetThreadIdInBlock(); - ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "__kmpc_kernel_init() must be called by team master warp only!"); PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); @@ -58,7 +57,7 @@ omptarget_nvptx_device_State[slot].Dequeue(); // init thread private - int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); + const unsigned threadId = 0; omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); // init team context @@ -93,15 +92,17 @@ int16_t RequiresDataSharing) { PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); - setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized - : RuntimeUninitialized); - int threadId = GetThreadIdInBlock(); + const unsigned threadId = GetThreadIdInBlock(); + const unsigned WID = GetWarpId(); + const unsigned LID = GetLaneId(); if (threadId == 0) { + setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized + : RuntimeUninitialized); usedSlotIdx = smid() % MAX_SM; parallelLevel[0] = 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = + } else if (LID == 0) { + parallelLevel[WID] = 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); } if (!RequiresOMPRuntime) { @@ -144,13 +145,12 @@ // init thread private from init value PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)ThreadLimit); + "thread will execute parallel region with id %d in a team of %d " + "threads\n", + (int)threadId, (int)ThreadLimit); - if (RequiresDataSharing && GetLaneId() == 0) { + if (RequiresDataSharing && LID == 0) { // Warp master innitializes data sharing environment. - unsigned WID = threadId / WARPSIZE; __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( WID, WID == WARPSIZE - 1); DataSharingState.SlotPtr[WID] = RootS; Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -39,7 +39,6 @@ // not in parallel items.flags = 0; - items.threadId = 0; // is master items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 } @@ -55,8 +54,6 @@ items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 prev = parentTaskDescr; } @@ -96,49 +93,33 @@ INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( omptarget_nvptx_TaskDescr *workTaskDescr) { Copy(workTaskDescr); - // - // overrwrite specific items; - // - // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). - // This is so that the serial master (first lane in the master warp) - // gets a threadId of 0. - // However, we know that this function is always called in a parallel - // region where only workers are active. The serial master thread - // never enters this region. When a parallel region is executed serially, - // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions - // are called, which never activate this region. - items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) } INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { CopyParent(parentTaskDescr); items.flags |= TaskDescr_InParL2P; // In L2+ parallelism - items.threadId = tid; } -INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { +INLINE void omptarget_nvptx_TaskDescr::SaveLoopData(int threadId) { loopData.loopUpperBound = - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); + omptarget_nvptx_threadPrivateContext->LoopUpperBound(threadId); loopData.nextLowerBound = - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); + omptarget_nvptx_threadPrivateContext->NextLowerBound(threadId); loopData.schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); - loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); - loopData.stride = - omptarget_nvptx_threadPrivateContext->Stride(items.threadId); + omptarget_nvptx_threadPrivateContext->ScheduleType(threadId); + loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(threadId); + loopData.stride = omptarget_nvptx_threadPrivateContext->Stride(threadId); } -INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { - omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = +INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData(int threadId) const { + omptarget_nvptx_threadPrivateContext->Chunk(threadId) = loopData.chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(threadId) = loopData.loopUpperBound; - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = + omptarget_nvptx_threadPrivateContext->NextLowerBound(threadId) = loopData.nextLowerBound; - omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = - loopData.stride; - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = + omptarget_nvptx_threadPrivateContext->Stride(threadId) = loopData.stride; + omptarget_nvptx_threadPrivateContext->ScheduleType(threadId) = loopData.schedule; } Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -308,7 +308,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)nThreads); + (int)threadId, (int)nThreads); isActive = true; IncParallelLevel(threadsInTeam != 1); @@ -355,7 +355,7 @@ // get current task omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - currTaskDescr->SaveLoopData(); + currTaskDescr->SaveLoopData(threadId); // allocate new task descriptor and copy value from current one, set prev to // it @@ -364,11 +364,6 @@ "new seq parallel task"); newTaskDescr->CopyParent(currTaskDescr); - // tweak values for serialized parallel case: - // - each thread becomes ID 0 in its serialized parallel, and - // - there is only one thread per team - newTaskDescr->ThreadId() = 0; - // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); @@ -395,7 +390,7 @@ // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); currTaskDescr = getMyTopTaskDescriptor(threadId); - currTaskDescr->RestoreLoopData(); + currTaskDescr->RestoreLoopData(threadId); } EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { @@ -409,8 +404,7 @@ // it's cheap to recalculate this value so we never use the result // of this call. EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - return GetOmpThreadId(tid, checkSPMDMode(loc)); + return GetOmpThreadId(checkSPMDMode(loc)); } //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -49,8 +49,7 @@ INLINE int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -INLINE int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num +INLINE int GetOmpThreadId(bool isSPMDExecutionMode); // omp_thread_num INLINE int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -149,18 +149,14 @@ // //////////////////////////////////////////////////////////////////////////////// -INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { +INLINE int GetOmpThreadId(bool isSPMDExecutionMode) { // omp_thread_num int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { - rc = 0; - } else if (isSPMDExecutionMode) { - rc = GetThreadIdInBlock(); + unsigned parLevel = parallelLevel[GetWarpId()]; + if (parLevel == OMP_ACTIVE_PARALLEL_LEVEL + 1) { + rc = GetLogicalThreadIdInBlock(isSPMDExecutionMode); } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - rc = currTaskDescr->ThreadId(); + rc = 0; } return rc; } @@ -168,15 +164,14 @@ INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { // omp_num_threads int rc; - int Level = parallelLevel[GetWarpId()]; - if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { + unsigned parLevel = parallelLevel[GetWarpId()]; + if (parLevel != OMP_ACTIVE_PARALLEL_LEVEL + 1) { rc = 1; } else if (isSPMDExecutionMode) { rc = GetNumberOfThreadsInBlock(); } else { rc = threadsInTeam; } - return rc; }