diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h
--- a/openmp/libomptarget/deviceRTLs/common/allocator.h
+++ b/openmp/libomptarget/deviceRTLs/common/allocator.h
@@ -39,6 +39,12 @@
 #define EXTERN_SHARED(NAME)                                                    \
   NAME;                                                                        \
   OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
+// now that's not the case.
+#define THREAD_LOCAL(NAME)                                                     \
+  [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
+
 #endif
 
 #endif // OMPTARGET_ALLOCATOR_H
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -132,8 +132,6 @@
   INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
   INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                    uint16_t tid, uint16_t tnum);
-  INLINE void SaveLoopData();
-  INLINE void RestoreLoopData() const;
 
 private:
   // bits for flags: (6 used, 2 free)
@@ -147,14 +145,6 @@
   static const uint8_t TaskDescr_IsParConstr = 0x20;
   static const uint8_t TaskDescr_InParL2P = 0x40;
 
-  struct SavedLoopDescr_items {
-    int64_t loopUpperBound;
-    int64_t nextLowerBound;
-    int64_t chunk;
-    int64_t stride;
-    kmp_sched_t schedule;
-  } loopData;
-
   struct TaskDescr_items {
     uint8_t flags; // 6 bit used (see flag above)
     uint8_t unused;
@@ -223,6 +213,7 @@
 // thread private data (struct of arrays for better coalescing)
 // tid refers here to the global thread id
 // do not support multiple concurrent kernel a this time
+
 class omptarget_nvptx_ThreadPrivateContext {
 public:
   // task
@@ -238,13 +229,6 @@
   INLINE uint16_t &NumThreadsForNextParallel(int tid) {
     return nextRegion.tnum[tid];
   }
-  // schedule (for dispatch)
-  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
-  INLINE int64_t &Chunk(int tid) { return chunk[tid];
} - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } - INLINE int64_t &Stride(int tid) { return stride[tid]; } - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } INLINE void InitThreadPrivateContext(int tid); @@ -263,12 +247,6 @@ uint16_t tnum[MAX_THREADS_PER_TEAM]; } nextRegion; // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; - int64_t stride[MAX_THREADS_PER_TEAM]; uint64_t cnt; }; diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -116,30 +116,6 @@ items.threadId = tid; } -INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { - loopData.loopUpperBound = - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); - loopData.nextLowerBound = - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); - loopData.schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); - loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); - loopData.stride = - omptarget_nvptx_threadPrivateContext->Stride(items.threadId); -} - -INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { - omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = - loopData.loopUpperBound; - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = - loopData.nextLowerBound; - omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = - 
loopData.stride; - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = - loopData.schedule; -} - //////////////////////////////////////////////////////////////////////////////// // Thread Private Context //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -17,6 +17,15 @@ #include "target/shuffle.h" #include "target_impl.h" +struct DynamicScheduleTracker { + int64_t Chunk; + int64_t LoopUpperBound; + int64_t NextLowerBound; + int64_t Stride; + kmp_sched_t ScheduleType; + DynamicScheduleTracker *NextDST; +}; + //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // template class that encapsulate all the helper functions @@ -203,7 +212,7 @@ INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, kmp_sched_t schedule, T lb, T ub, ST st, - ST chunk) { + ST chunk, DynamicScheduleTracker *DST) { if (checkRuntimeUninitialized(loc)) { // In SPMD mode no need to check parallelism level - dynamic scheduling // may appear only in L2 parallel regions with lightweight runtime. 
@@ -279,32 +288,29 @@ if (schedule == kmp_sched_static_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + DST->ScheduleType = schedule; // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + DST->LoopUpperBound = ub; // compute static chunk ST stride; int lastiter = 0; ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); + (int)tnum, DST->LoopUpperBound, + (unsigned long long)DST->NextLowerBound, + (unsigned long long)DST->Stride); } else if (schedule == kmp_sched_static_balanced_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + DST->ScheduleType = schedule; // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + DST->LoopUpperBound = ub; // compute static chunk ST stride; int lastiter = 0; @@ -319,49 +325,45 @@ if (ub > oldUb) ub = oldUb; // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; PRINT(LD_LOOP, "dispatch init (static chunk) : num threads 
= %d, ub = %" PRId64
           ", next lower bound = %llu, stride = %llu\n",
-          (int)tnum,
-          omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-          (unsigned long long)
-              omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
-          (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
-              tid));
+          (int)tnum, DST->LoopUpperBound,
+          (unsigned long long)DST->NextLowerBound,
+          (unsigned long long)
+              DST->Stride);
   } else if (schedule == kmp_sched_static_nochunk) {
     ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
     // save sched state
-    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+    DST->ScheduleType = schedule;
     // save ub
-    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+    DST->LoopUpperBound = ub;
     // compute static chunk
     ST stride;
     int lastiter = 0;
     ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
     // save computed params
-    omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-    omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
-    omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+    DST->Chunk = chunk;
+    DST->NextLowerBound = lb;
+    DST->Stride = stride;
     PRINT(LD_LOOP,
           "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
           ", next lower bound = %llu, stride = %llu\n",
-          (int)tnum,
-          omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
-          (unsigned long long)
-              omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
-          (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
-              tid));
+          (int)tnum, DST->LoopUpperBound,
+          (unsigned long long)DST->NextLowerBound,
+          (unsigned long long)
+              DST->Stride);
   } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
     // save data
-    omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+    DST->ScheduleType = schedule;
     if (chunk < 1)
       chunk = 1;
-    omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
-    omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
-    omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+    DST->Chunk = chunk;
+    DST->LoopUpperBound = 
ub; + DST->NextLowerBound = lb; __kmpc_barrier(loc, threadId); if (tid == 0) { omptarget_nvptx_threadPrivateContext->Cnt() = 0; @@ -371,11 +373,8 @@ PRINT(LD_LOOP, "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 ", chunk %" PRIu64 "\n", - (int)tnum, - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid)); + (int)tnum, (unsigned long long)DST->NextLowerBound, + DST->LoopUpperBound, DST->Chunk); } } @@ -440,7 +439,8 @@ } INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride) { + T *plower, T *pupper, ST *pstride, + DynamicScheduleTracker *DST) { if (checkRuntimeUninitialized(loc)) { // In SPMD mode no need to check parallelism level - dynamic scheduling // may appear only in L2 parallel regions with lightweight runtime. @@ -457,14 +457,13 @@ ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), "current thread is not needed here; error"); // retrieve schedule - kmp_sched_t schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + kmp_sched_t schedule = DST->ScheduleType; // xxx reduce to one if (schedule == kmp_sched_static_chunk || schedule == kmp_sched_static_nochunk) { - T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); - T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + T myLb = DST->NextLowerBound; + T ub = DST->LoopUpperBound; // finished? 
if (myLb > ub) { PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", @@ -472,7 +471,7 @@ return DISPATCH_FINISHED; } // not finished, save current bounds - ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + ST chunk = DST->Chunk; *plower = myLb; T myUb = myLb + chunk - 1; // Clang uses i <= ub if (myUb > ub) @@ -481,8 +480,8 @@ *plast = (int32_t)(myUb == ub); // increment next lower bound by the stride - ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; + ST stride = DST->Stride; + DST->NextLowerBound = myLb + stride; PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", (long long)*plower, (long long)*pupper); return DISPATCH_NOTFINISHED; @@ -491,10 +490,8 @@ schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, "bad sched"); T myLb, myUb; - int finished = DynamicNextChunk( - myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound, + DST->LoopUpperBound); if (finished == FINISHED) return DISPATCH_FINISHED; @@ -527,89 +524,125 @@ // KMP interface implementation (dyn loops) //////////////////////////////////////////////////////////////////////////////// +// TODO: This is a stopgap. We probably want to expand the dispatch API to take +// an DST pointer which can then be allocated properly without malloc. +DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr); + +// Create a new DST, link the current one, and define the new as current. 
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      SafeMalloc(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  SafeFree(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
 // init
 EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, int32_t lb, int32_t ub,
                                    int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
                                     int32_t schedule, uint32_t lb, uint32_t ub,
                                     int32_t st, int32_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, int64_t lb, int64_t ub,
                                    int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
                                     int32_t schedule, uint64_t lb, uint64_t ub,
                                     int64_t st, int64_t chunk) {
   PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  DynamicScheduleTracker *DST = pushDST();
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
-      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
 }
 
 // next
 EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
                                    int32_t *p_last, uint32_t *p_lb,
                                    uint32_t *p_ub, int32_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
                                    int32_t *p_last, uint64_t *p_lb,
                                    uint64_t *p_ub, int64_t *p_st) {
   PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  DynamicScheduleTracker *DST = peekDST();
   return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
-      loc, tid, p_last, p_lb, p_ub, p_st);
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
 }
 
 // fini
 EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
   omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
 }
 
 EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
   omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
 }
 
 EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t 
tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
   omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
 }
 
 EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
   omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -216,7 +216,6 @@
 
   // get current task
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
-  currTaskDescr->SaveLoopData();
 
   // allocate new task descriptor and copy value from current one, set prev to
   // it
@@ -256,7 +255,6 @@
   // free
   SafeFree(currTaskDescr, "new seq parallel task");
   currTaskDescr = getMyTopTaskDescriptor(threadId);
-  currTaskDescr->RestoreLoopData();
 }
 
 EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {