Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -298,7 +298,9 @@ // compute static chunk ST stride; int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ForStaticChunk( + lastiter, lb, ub, stride, chunk, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); // save computed params omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; @@ -320,7 +322,9 @@ // compute static chunk ST stride; int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ForStaticNoChunk( + lastiter, lb, ub, stride, chunk, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); // save computed params omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; @@ -366,10 +370,11 @@ // Support for dispatch next INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, - Counter &loopLowerBound, + int64_t &loopLowerBound, T loopUpperBound) { // calculate lower bound for all lanes in the warp - lb = atomicAdd(&loopLowerBound, (Counter)chunkSize); + lb = atomicAdd((unsigned long long *)&loopLowerBound, + (unsigned long long)chunkSize); ub = lb + chunkSize - 1; // Clang uses i <= ub // 3 result cases: Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -192,6 +192,8 @@ INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum); + 
INLINE void SaveLoopData(); /* copies this thread's loop-scheduling slots from omptarget_nvptx_threadPrivateContext into loopData */ + INLINE void RestoreLoopData() const; /* writes the values held in loopData back into those slots */ private: // bits for flags: (7 used, 1 free) @@ -207,6 +209,14 @@ static const uint8_t TaskDescr_IsParConstr = 0x20; static const uint8_t TaskDescr_InParL2P = 0x40; + struct SavedLoopDescr_items { /* holding area filled by SaveLoopData() and read by RestoreLoopData(): one field per saved slot (loop upper bound, next lower bound, chunk, stride, schedule type) */ + int64_t loopUpperBound; + int64_t nextLowerBound; + int64_t chunk; + int64_t stride; + kmp_sched_t schedule; + } loopData; + struct TaskDescr_items { uint8_t flags; // 6 bit used (see flag above) uint8_t unused; @@ -335,16 +345,8 @@ INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } INLINE int64_t &Chunk(int tid) { return chunk[tid]; } INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - // state for dispatch with dyn/guided - INLINE Counter &CurrentEvent(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } - // state for dispatch with static - INLINE Counter &NextLowerBound(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } + INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } /* mutable reference to slot tid of nextLowerBound[] */ + INLINE int64_t &Stride(int tid) { return stride[tid]; } /* mutable reference to slot tid of stride[] */ INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } @@ -373,8 +375,8 @@ int64_t chunk[MAX_THREADS_PER_TEAM]; int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; // state for dispatch with dyn/guided OR static (never use both at a time) - Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; - Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; + int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; /* backing store for NextLowerBound(tid) */ + int64_t stride[MAX_THREADS_PER_TEAM]; /* backing store for Stride(tid) */ // Queue to which this object must be returned. 
uint64_t SourceQueue; }; Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -125,6 +125,30 @@ items.threadId = tid; } +INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { /* Snapshot of this thread's loop state: copies LoopUpperBound, NextLowerBound, ScheduleType, Chunk and Stride from omptarget_nvptx_threadPrivateContext (slot items.threadId) into loopData. NOTE(review): confirm items.threadId is the same index the loop code uses when it stores these values. */ + loopData.loopUpperBound = + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); + loopData.nextLowerBound = + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); + loopData.schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); + loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); + loopData.stride = + omptarget_nvptx_threadPrivateContext->Stride(items.threadId); +} + +INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { /* Inverse of SaveLoopData: writes the five loopData fields back into the omptarget_nvptx_threadPrivateContext slot items.threadId. Declared const; it assigns only through the context accessors, not to members of this descriptor. */ + omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = + loopData.loopUpperBound; + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = + loopData.nextLowerBound; + omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = + loopData.stride; + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = + loopData.schedule; +} + //////////////////////////////////////////////////////////////////////////////// // Thread Private Context //////////////////////////////////////////////////////////////////////////////// Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -386,6 +386,7 @@ // get current task omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->SaveLoopData(); // 
allocate new task descriptor and copy value from current one, set prev to // it @@ -417,6 +418,8 @@ threadId, currTaskDescr->GetPrevTaskDescr()); // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); + currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->RestoreLoopData(); } EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {