diff --git a/openmp/libomptarget/deviceRTLs/common/include/ICVs.h b/openmp/libomptarget/deviceRTLs/common/include/ICVs.h --- a/openmp/libomptarget/deviceRTLs/common/include/ICVs.h +++ b/openmp/libomptarget/deviceRTLs/common/include/ICVs.h @@ -8,9 +8,13 @@ // // //===----------------------------------------------------------------------===// + #ifndef OMPTARGET_ICVS_H #define OMPTARGET_ICVS_H +#include "interface.h" +#include + struct ICVStateTy { int nthreads_var; @@ -22,11 +26,32 @@ /// active-levels-var is 1, if active_level is not 0, otherweise it is 0. int active_level; + /// Don't use UB type punning! + /// + ///{ + struct RunSchedVarEncodingTy { + omp_sched_t ScheduleKind; + int ChunkSize; + }; + + uint64_t run_sched_var; + + static_assert( + sizeof(run_sched_var) == sizeof(RunSchedVarEncodingTy), + "Schedule encoding is supposed to cover the entire run-sched-var ICV!"); + ///} + static bool ensureICVStateForThread(unsigned TId); static int &getICVForThread(int ICVStateTy::*Var); static int incICVForThread(int ICVStateTy::*Var, int UpdateVal); static int setICVForThread(int ICVStateTy::*Var, int UpdateVal); + + static uint64_t &getICVForThread(uint64_t ICVStateTy::*Var); + static uint64_t incICVForThread(uint64_t ICVStateTy::*Var, + uint64_t UpdateVal); + static uint64_t setICVForThread(uint64_t ICVStateTy::*Var, + uint64_t UpdateVal); }; #ifdef __cplusplus diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -102,20 +102,7 @@ class omptarget_nvptx_TaskDescr { public: - // methods for flags - INLINE omp_sched_t GetRuntimeSched() const; - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() const { - return items.flags & TaskDescr_InParL2P; - } - INLINE int IsParallelConstruct() const { - return items.flags & TaskDescr_IsParConstr; - } - INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } // methods for other fields - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { prev = taskDescr; @@ -133,23 +120,6 @@ uint16_t tid, uint16_t tnum); private: - // bits for flags: (6 used, 2 free) - // 3 bits (SchedMask) for runtime schedule - // 1 bit (InPar) if this thread has encountered one or more parallel region - // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) - // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel - // region - static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); - static const uint8_t TaskDescr_InPar = 0x10; - static const uint8_t TaskDescr_IsParConstr = 0x20; - static const uint8_t TaskDescr_InParL2P = 0x40; - - struct TaskDescr_items { - uint8_t flags; // 6 bit used (see flag above) - uint8_t unused; - uint16_t threadId; // thread id - uint64_t runtimeChunkSize; // runtime chunk size - } items; omptarget_nvptx_TaskDescr *prev; }; diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -15,53 +15,18 @@ // Task Descriptor //////////////////////////////////////////////////////////////////////////////// -INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const { - // sched starts from 1..4; encode it as 0..3; so add 1 here - uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; - return (omp_sched_t)rc; -} - -INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { - // sched starts from 1..4; encode it as 0..3; so sub 1 here - uint8_t val = ((uint8_t)sched) - 1; - // clear current sched - items.flags &= ~TaskDescr_SchedMask; - // set new sched - items.flags |= val; -} - INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // not in parallel - - items.flags = 0; - items.threadId = 0; // is master - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 } // This is called when all threads are started together in SPMD mode. // OMP directives include target parallel, target distribute parallel for, etc. INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( omptarget_nvptx_TaskDescr *parentTaskDescr) { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // in L1 parallel - - items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 prev = parentTaskDescr; } INLINE void omptarget_nvptx_TaskDescr::CopyData( omptarget_nvptx_TaskDescr *sourceTaskDescr) { - items = sourceTaskDescr->items; } INLINE void @@ -79,41 +44,21 @@ INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( omptarget_nvptx_TaskDescr *parentTaskDescr) { CopyParent(parentTaskDescr); - items.flags = items.flags & ~TaskDescr_IsParConstr; - ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); } INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( omptarget_nvptx_TaskDescr *masterTaskDescr) { CopyParent(masterTaskDescr); - // overwrite specific items; - items.flags |= - TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel } INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( omptarget_nvptx_TaskDescr *workTaskDescr) { Copy(workTaskDescr); - // - // overwrite specific items; - // - // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). - // This is so that the serial master (first lane in the master warp) - // gets a threadId of 0. - // However, we know that this function is always called in a parallel - // region where only workers are active. The serial master thread - // never enters this region. When a parallel region is executed serially, - // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions - // are called, which never activate this region. - items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) } INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { CopyParent(parentTaskDescr); - items.flags |= TaskDescr_InParL2P; // In L2+ parallelism - items.threadId = tid; } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp b/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp --- a/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp +++ b/openmp/libomptarget/deviceRTLs/common/src/ICVs.cpp @@ -18,6 +18,7 @@ #include "omptarget.h" #include "support.h" #include "target_interface.h" +#include #include #define ICV_DEBUG(...) @@ -119,6 +120,24 @@ return !!ICVStateTy::getICVForThread(&ICVStateTy::active_level); } +void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { + uint64_t RunSchedVar = + ICVStateTy::getICVForThread(&ICVStateTy::run_sched_var); + ICVStateTy::RunSchedVarEncodingTy RunSchedVarEncoding; + memcpy(&RunSchedVarEncoding, &RunSchedVar, sizeof(RunSchedVarEncoding)); + *ScheduleKind = RunSchedVarEncoding.ScheduleKind; + *ChunkSize = RunSchedVarEncoding.ChunkSize; +} + +void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { + ICVStateTy::RunSchedVarEncodingTy RunSchedVarEncoding; + RunSchedVarEncoding.ScheduleKind = ScheduleKind; + RunSchedVarEncoding.ChunkSize = ChunkSize; + uint64_t RunSchedVar; + memcpy(&RunSchedVar, &RunSchedVarEncoding, sizeof(RunSchedVarEncoding)); + ICVStateTy::setICVForThread(&ICVStateTy::run_sched_var, RunSchedVar); +} + static int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, int OutOfBoundsVal = -1) { if (Level == 0) @@ -197,9 +216,9 @@ ThreadStates[TId] = nullptr; } -DEVICE TeamStateTy SHARED(omp::TeamState); +TeamStateTy SHARED(omp::TeamState); -[[clang::loader_uninitialized]] DEVICE ThreadStateTy +[[clang::loader_uninitialized]] ThreadStateTy *omp::ThreadStates[MAX_THREADS_PER_TEAM]; #pragma omp allocate(omp::ThreadStates) allocator(omp_pteam_mem_alloc) diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -74,41 +74,6 @@ return rc; } -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - *kind = omp_sched_static; - *modifier = 1; - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - *kind = currTaskDescr->GetRuntimeSched(); - *modifier = currTaskDescr->RuntimeChunkSize(); - } - PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", - (int)*kind, *modifier); -} - -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { - PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, - modifier); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - return; - } - if (kind >= omp_sched_static && kind < omp_sched_auto) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - currTaskDescr->SetRuntimeSched(kind); - currTaskDescr->RuntimeChunkSize() = modifier; - PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", - (int)currTaskDescr->GetRuntimeSched(), - currTaskDescr->RuntimeChunkSize()); - } -} - EXTERN omp_proc_bind_t omp_get_proc_bind(void) { PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); return omp_proc_bind_true; diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -248,8 +248,10 @@ chunk = tripCount; // one thread gets the whole loop } else if (schedule == kmp_sched_runtime) { // process runtime - omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); - chunk = currTaskDescr->RuntimeChunkSize(); + omp_sched_t rtSched; + int ChunkInt; + omp_get_schedule(&rtSched, &ChunkInt); + chunk = ChunkInt; switch (rtSched) { case omp_sched_static: { if (chunk > 0) diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -144,7 +144,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)ThreadLimit); + (int)threadId, (int)ThreadLimit); } EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -81,12 +81,6 @@ omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), - "cannot be called in a parallel region."); - if (currTaskDescr->InParallelRegion()) { - PRINT0(LD_PAR, "already in parallel: go seq\n"); - return; - } uint16_t NumThreads = determineNumberOfThreads(); TeamState.ParallelTeamSize = NumThreads; @@ -152,7 +146,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)nThreads); + (int)threadId, (int)nThreads); } return ThreadIsActive; @@ -211,11 +205,6 @@ "new seq parallel task"); newTaskDescr->CopyParent(currTaskDescr); - // tweak values for serialized parallel case: - // - each thread becomes ID 0 in its serialized parallel, and - // - there is only one thread per team - newTaskDescr->ThreadId() = 0; - // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr);