Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -79,7 +79,7 @@ EXTERN void __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, size_t InitialDataSize) { - + assert(isRuntimeInitialized() && "Expected initialized runtime."); DSPRINT0(DSFLAG_INIT, "Entering __kmpc_initialize_data_sharing_environment\n"); @@ -331,6 +331,7 @@ //////////////////////////////////////////////////////////////////////////////// INLINE void data_sharing_init_stack_common() { + assert(isRuntimeInitialized() && "Expected initialized runtime."); omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); @@ -346,6 +347,7 @@ // initialization). This function is called only by the MASTER thread of each // team in non-SPMD mode. EXTERN void __kmpc_data_sharing_init_stack() { + assert(isRuntimeInitialized() && "Expected initialized runtime."); // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. @@ -357,6 +359,7 @@ // once at the beginning of a data sharing context (coincides with the kernel // initialization). This function is called in SPMD mode only. EXTERN void __kmpc_data_sharing_init_stack_spmd() { + assert(isRuntimeInitialized() && "Expected initialized runtime."); // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -116,6 +116,8 @@ kmp_sched_runtime = 37, kmp_sched_auto = 38, + kmp_sched_static_balanced_chunk = 45, + kmp_sched_static_ordered = 65, kmp_sched_static_nochunk_ordered = 66, kmp_sched_dynamic_ordered = 67, Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -31,6 +31,10 @@ } EXTERN void omp_set_num_threads(int num) { + // Ignore it for SPMD mode. + if (isSPMDMode()) + return; + assert(isRuntimeInitialized() && "Expected initialized runtime."); PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); if (num <= 0) { WARNING0(LW_INPUT, "expected positive num; ignore\n"); @@ -48,6 +52,12 @@ } EXTERN int omp_get_max_threads(void) { + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + // We're already in parallel region. + return 1; // default is 1 thread avail + } omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); int rc = 1; // default is 1 thread avail if (!currTaskDescr->InParallelRegion()) { @@ -60,6 +70,11 @@ } EXTERN int omp_get_thread_limit(void) { + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return 0; // default is 0 + } // per contention group.. meaning threads in current team omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); int rc = currTaskDescr->ThreadLimit(); @@ -82,9 +97,15 @@ EXTERN int omp_in_parallel(void) { int rc = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); - if (currTaskDescr->InParallelRegion()) { - rc = 1; + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + rc = 1; // SPMD mode is always in parallel. + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + if (currTaskDescr->InParallelRegion()) { + rc = 1; + } } PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); return rc; @@ -102,6 +123,11 @@ EXTERN void omp_set_dynamic(int flag) { PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag); + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return; + } omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); if (flag) { @@ -113,6 +139,11 @@ EXTERN int omp_get_dynamic(void) { int rc = 0; + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return rc; + } omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); if (currTaskDescr->IsDynamic()) { rc = 1; @@ -145,6 +176,11 @@ } EXTERN int omp_get_level(void) { + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); + } int level = 0; omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); ASSERT0(LT_FUSSY, currTaskDescr, @@ -160,6 +196,11 @@ } EXTERN int omp_get_active_level(void) { + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return 1; + } int level = 0; // no active level parallelism omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); ASSERT0(LT_FUSSY, currTaskDescr, @@ -177,6 +218,11 @@ } EXTERN int omp_get_ancestor_thread_num(int level) { + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return level == 1 ? GetThreadIdInBlock() : 0; + } int rc = 0; // default at level 0 if (level >= 0) { int totLevel = omp_get_level(); @@ -220,6 +266,11 @@ } EXTERN int omp_get_team_size(int level) { + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return level == 1 ? GetNumberOfThreadsInBlock() : 1; + } int rc = 1; // default at level 0 if (level >= 0) { int totLevel = omp_get_level(); @@ -247,9 +298,16 @@ } EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); - *kind = currTaskDescr->GetRuntimeSched(); - *modifier = currTaskDescr->RuntimeChunkSize(); + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + *kind = omp_sched_static; + *modifier = 1; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + *kind = currTaskDescr->GetRuntimeSched(); + *modifier = currTaskDescr->RuntimeChunkSize(); + } PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", (int)*kind, *modifier); } @@ -257,6 +315,11 @@ EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, modifier); + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && + "expected SPMD mode only with uninitialized runtime."); + return; + } if (kind >= omp_sched_static && kind < omp_sched_auto) { omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); currTaskDescr->SetRuntimeSched(kind); Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -131,7 +131,7 @@ ST stride = *pstride; T entityId, numberOfEntities; // init - switch (schedtype) { + switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { case kmp_sched_static_chunk: { if (chunk > 0) { entityId = @@ -143,6 +143,28 @@ break; } } // note: if chunk <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if (chunk > 0) { + entityId = + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); + numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, + IsRuntimeUninitialized); + + // round up to make sure the chunk is enough to cover all iterations + T tripCount = ub - lb + 1; // +1 because ub is inclusive + T span = (tripCount + numberOfEntities - 1) / numberOfEntities; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + assert(ub >= lb && "ub must be >= lb."); + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, + numberOfEntities); + if (ub > oldUb) + ub = oldUb; + break; + } + } // note: if chunk <=0, use nochunk case kmp_sched_static_nochunk: { entityId = GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); @@ -199,12 +221,13 @@ *plower = lb; *pupper = ub; *pstride = stride; - PRINT(LD_LOOP, - "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", - GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsRuntimeUninitialized), - GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), - P64(*pstride)); + PRINT( + LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, IsRuntimeUninitialized), + GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride), + lastiter); } //////////////////////////////////////////////////////////////////////////////// @@ -218,6 +241,8 @@ INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId, kmp_sched_t schedule, T lb, T ub, ST st, ST chunk) { + assert(isRuntimeInitialized() && + "Expected non-SPMD mode + initialized runtime."); int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); T tnum = currTaskDescr->ThreadsInTeam(); @@ -308,7 +333,38 @@ omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); + } else if (schedule == kmp_sched_static_balanced_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + // round up to make sure the chunk is enough to cover all iterations + T span = (tripCount + tnum - 1) / tnum; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + T oldUb = ub; + ForStaticChunk( + lastiter, lb, ub, stride, chunk, + GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum); + assert(ub >= lb && "ub must be >= lb."); + if (ub > oldUb) + ub = oldUb; + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->Stride(tid)); } else if (schedule == kmp_sched_static_nochunk) { ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); // save sched state @@ -398,6 +454,8 @@ // in a warp cannot make independent progress. NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper, ST *pstride) { + assert(isRuntimeInitialized() && + "Expected non-SPMD mode + initialized runtime."); // ID of a thread in its own warp // automatically selects thread or warp ID based on selected implementation @@ -458,10 +516,11 @@ *pstride = 1; PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), - P64(*pstride)); + GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride), + *plast); return DISPATCH_NOTFINISHED; } @@ -736,6 +795,8 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid, int32_t varNum, void *array) { PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + assert(isRuntimeInitialized() && + "Expected non-SPMD mode + initialized runtime."); omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(), Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -27,10 +27,17 @@ omptarget_nvptx_Queue omptarget_nvptx_device_State[MAX_SM]; +__device__ omptarget_nvptx_Queue + omptarget_nvptx_device_simpleState[MAX_SM]; + // Pointer to this team's OpenMP state object __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext + *omptarget_nvptx_simpleThreadPrivateContext; + //////////////////////////////////////////////////////////////////////////////// // The team master sets the outlined parallel function in this variable to // communicate with the workers. Since it is in shared memory, there is one Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -395,6 +395,38 @@ int32_t debug_level; }; +class omptarget_nvptx_SimpleThreadPrivateContext { + uint16_t par_level[MAX_THREADS_PER_TEAM]; +public: + INLINE void Init() { + assert(isSPMDMode() && isRuntimeUninitialized() && + "Expected SPMD + uninitialized runtime modes."); + par_level[GetThreadIdInBlock()] = 0; + } + INLINE void IncParLevel() { + assert(isSPMDMode() && isRuntimeUninitialized() && + "Expected SPMD + uninitialized runtime modes."); + ++par_level[GetThreadIdInBlock()]; + } + INLINE void DecParLevel() { + assert(isSPMDMode() && isRuntimeUninitialized() && + "Expected SPMD + uninitialized runtime modes."); + assert(par_level[GetThreadIdInBlock()] > 0 && + "Expected parallel level >0."); + --par_level[GetThreadIdInBlock()]; + } + INLINE bool InL2OrHigherParallelRegion() const { + assert(isSPMDMode() && isRuntimeUninitialized() && + "Expected SPMD + uninitialized runtime modes."); + return par_level[GetThreadIdInBlock()] > 0; + } + INLINE uint16_t GetParallelLevel() const { + assert(isSPMDMode() && isRuntimeUninitialized() && + "Expected SPMD + uninitialized runtime modes."); + return par_level[GetThreadIdInBlock()] + 1; + } +}; + //////////////////////////////////////////////////////////////////////////////// // global device envrionment //////////////////////////////////////////////////////////////////////////////// @@ -409,6 +441,9 @@ extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext + *omptarget_nvptx_simpleThreadPrivateContext; + extern __device__ __shared__ uint32_t execution_param; extern __device__ __shared__ void *ReductionScratchpadPtr; Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -24,6 +24,13 @@ extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +extern __device__ omptarget_nvptx_Queue< + omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT> + omptarget_nvptx_device_simpleState[MAX_SM]; + +extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext + *omptarget_nvptx_simpleThreadPrivateContext; + // // The team master sets the outlined function and its arguments in these // variables to communicate with the workers. Since they are in shared memory, @@ -53,12 +60,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", OMPTARGET_NVPTX_VERSION); - - if (!RequiresOMPRuntime) { - // If OMP runtime is not required don't initialize OMP state. - setExecutionParameters(Generic, RuntimeUninitialized); - return; - } + assert(RequiresOMPRuntime && "Generic always requires initialized runtime."); setExecutionParameters(Generic, RuntimeInitialized); int threadIdInBlock = GetThreadIdInBlock(); @@ -95,16 +97,16 @@ } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { - if (IsOMPRuntimeInitialized) { - // Enqueue omp state object for use by another team. + assert(IsOMPRuntimeInitialized && + "Generic always requires initialized runtime."); + // Enqueue omp state object for use by another team. #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue(); + int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue(); #else - int slot = smid() % MAX_SM; + int slot = smid() % MAX_SM; #endif - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - } + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; } @@ -116,6 +118,13 @@ if (!RequiresOMPRuntime) { // If OMP runtime is not required don't initialize OMP state. setExecutionParameters(Spmd, RuntimeUninitialized); + if (GetThreadIdInBlock() == 0) { + int slot = smid() % MAX_SM; + omptarget_nvptx_simpleThreadPrivateContext = + omptarget_nvptx_device_simpleState[slot].Dequeue(); + } + __syncthreads(); + omptarget_nvptx_simpleThreadPrivateContext->Init(); return; } setExecutionParameters(Spmd, RuntimeInitialized); @@ -180,6 +189,15 @@ // there are no more parallel regions in SPMD mode. __syncthreads(); int threadId = GetThreadIdInBlock(); + if (isRuntimeUninitialized()) { + if (threadId == 0) { + // Enqueue omp state object for use by another team. + int slot = smid() % MAX_SM; + omptarget_nvptx_device_simpleState[slot].Enqueue( + omptarget_nvptx_simpleThreadPrivateContext); + return; + } + } if (threadId == 0) { // Enqueue omp state object for use by another team. int slot = smid() % MAX_SM; Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -216,10 +216,9 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized) { PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); - omptarget_nvptx_workFn = WorkFn; + assert(IsOMPRuntimeInitialized && "expected initialized runtime."); - if (!IsOMPRuntimeInitialized) - return; + omptarget_nvptx_workFn = WorkFn; // This routine is only called by the team master. The team master is // the first thread of the last warp. It always has the logical thread @@ -320,12 +319,11 @@ int16_t IsOMPRuntimeInitialized) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + assert(IsOMPRuntimeInitialized && "expected initialized runtime."); + // Work function and arguments for L1 parallel region. *WorkFn = omptarget_nvptx_workFn; - if (!IsOMPRuntimeInitialized) - return true; - // If this is the termination signal from the master, quit early. if (!*WorkFn) return false; @@ -363,6 +361,8 @@ EXTERN void __kmpc_kernel_end_parallel() { // pop stack PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + assert(isRuntimeInitialized() && "expected initialized runtime."); + // Only the worker threads call this routine and the master warp // never arrives here. Therefore, use the nvptx thread id. int threadId = GetThreadIdInBlock(); @@ -378,6 +378,12 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime."); + omptarget_nvptx_simpleThreadPrivateContext->IncParLevel(); + return; + } + // assume this is only called for nested parallel int threadId = GetLogicalThreadIdInBlock(); @@ -392,7 +398,7 @@ // it omptarget_nvptx_TaskDescr *newTaskDescr = (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), - (char *)"new seq parallel task"); + "new seq parallel task"); newTaskDescr->CopyParent(currTaskDescr); // tweak values for serialized parallel case: @@ -410,6 +416,12 @@ uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime."); + omptarget_nvptx_simpleThreadPrivateContext->DecParLevel(); + return; + } + // pop stack int threadId = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); @@ -425,6 +437,11 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); + if (isRuntimeUninitialized()) { + assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime."); + return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); + } + int threadId = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); @@ -451,6 +468,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid, int32_t num_threads) { PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); + assert(isRuntimeInitialized() && "Runtime must be initialized."); tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = num_threads; @@ -459,6 +477,7 @@ EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid, int32_t simd_limit) { PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit); + assert(isRuntimeInitialized() && "Runtime must be initialized."); tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; } Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -101,9 +101,13 @@ int rc; if (isRuntimeUninitialized) { - rc = GetThreadIdInBlock(); - if (!isSPMDExecutionMode && rc >= GetMasterThreadID()) + assert(isSPMDExecutionMode && "Uninitialized runtime with non-SPMD mode."); + // For level 2 parallelism all parallel regions are executed sequentially. + if (omptarget_nvptx_simpleThreadPrivateContext + ->InL2OrHigherParallelRegion()) rc = 0; + else + rc = GetThreadIdInBlock(); } else { omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); @@ -118,8 +122,13 @@ int rc; if (isRuntimeUninitialized) { - rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock() - : GetNumberOfThreadsInBlock() - WARPSIZE; + assert(isSPMDExecutionMode && "Uninitialized runtime with non-SPMD mode."); + // For level 2 parallelism all parallel regions are executed sequentially. + if (omptarget_nvptx_simpleThreadPrivateContext + ->InL2OrHigherParallelRegion()) + rc = 1; + else + rc = GetNumberOfThreadsInBlock(); } else { omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -42,10 +42,8 @@ EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { if (isRuntimeUninitialized()) { - if (isSPMDMode()) - __kmpc_barrier_simple_spmd(loc_ref, tid); - else - __kmpc_barrier_simple_generic(loc_ref, tid); + assert(isSPMDMode() && "Expected SPMD mode with uninitialized runtime."); + __kmpc_barrier_simple_spmd(loc_ref, tid); } else { tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -81,6 +81,7 @@ void *noAliasDepList) { PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", P64(newKmpTaskDescr)); + assert(isRuntimeInitialized() && "Runtime must be initialized."); // 1. get explict task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( @@ -117,6 +118,7 @@ kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", P64(newKmpTaskDescr)); + assert(isRuntimeInitialized() && "Runtime must be initialized."); // 1. get explict task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( @@ -141,6 +143,7 @@ kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", P64(newKmpTaskDescr)); + assert(isRuntimeInitialized() && "Runtime must be initialized."); // 1. get explict task descr from kmp task descr omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(