Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -47,8 +47,7 @@ EXTERN int omp_get_num_threads(void) { bool isSPMDExecutionMode = isSPMDMode(); int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = - GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); + int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode); PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); return rc; } @@ -89,7 +88,7 @@ EXTERN int omp_get_thread_num() { bool isSPMDExecutionMode = isSPMDMode(); int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized()); + int rc = GetOmpThreadId(tid, isSPMDExecutionMode); PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); return rc; } @@ -101,18 +100,7 @@ } EXTERN int omp_in_parallel(void) { - int rc = 0; - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - rc = 1; // SPMD mode is always in parallel. - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - if (currTaskDescr->InParallelRegion()) { - rc = 1; - } - } + int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); return rc; } @@ -161,46 +149,13 @@ } EXTERN int omp_get_level(void) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - // parallelLevel starts from 0, need to add 1 for correct level. - return parallelLevel[GetWarpId()] + 1; - } - int level = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (currTaskDescr->IsParallelConstruct()) { - level++; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); + int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); PRINT(LD_IO, "call omp_get_level() returns %d\n", level); return level; } EXTERN int omp_get_active_level(void) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - return 1; - } - int level = 0; // no active level parallelism - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (currTaskDescr->ThreadsInTeam() > 1) { - // has a parallel with more than one thread in team - level = 1; - break; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); + int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) return level; } Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/loop.cu +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -95,8 +95,7 @@ INLINE static void for_static_init(int32_t gtid, int32_t schedtype, int32_t *plastiter, T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode, - bool IsRuntimeUninitialized) { + bool IsSPMDExecutionMode) { // When IsRuntimeUninitialized is true, we assume that the caller is // in an L0 parallel region and that all worker threads participate. @@ -104,8 +103,8 @@ // Assume we are in teams region or that we use a single block // per target region - ST numberOfActiveOMPThreads = GetNumberOfOmpThreads( - tid, IsSPMDExecutionMode, IsRuntimeUninitialized); + ST numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, IsSPMDExecutionMode); // All warps that are in excess of the maximum requested, do // not execute the loop @@ -456,9 +455,7 @@ // automatically selects thread or warp ID based on selected implementation int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - ASSERT0(LT_FUSSY, - gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc), - checkRuntimeUninitialized(loc)), + ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)), "current thread is not needed here; error"); // retrieve schedule kmp_sched_t schedule = @@ -509,13 +506,12 @@ *pupper = myUb; *pstride = 1; - PRINT( - LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper, - (long long)*pstride, (int)*plast); + PRINT(LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(tid, isSPMDMode()), + (int)GetNumberOfWorkersInTeam(), (long long)*plower, + (long long)*pupper, (long long)*pstride, (int)*plast); return DISPATCH_NOTFINISHED; } @@ -629,7 +625,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc), checkRuntimeUninitialized(loc)); + checkSPMDMode(loc)); } EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, @@ -640,7 +636,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc), checkRuntimeUninitialized(loc)); + checkSPMDMode(loc)); } EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, @@ -651,7 +647,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc), checkRuntimeUninitialized(loc)); + checkSPMDMode(loc)); } EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, @@ -662,7 +658,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc), checkRuntimeUninitialized(loc)); + checkSPMDMode(loc)); } EXTERN @@ -674,7 +670,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/true); } EXTERN @@ -686,7 +682,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/true); } EXTERN @@ -698,7 +694,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/true); } EXTERN @@ -710,7 +706,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/true); } EXTERN @@ -721,7 +717,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/false); } EXTERN @@ -732,7 +728,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/false); } EXTERN @@ -743,7 +739,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/false); } EXTERN @@ -754,7 +750,7 @@ PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); + /*IsSPMDExecutionMode=*/false); } EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { @@ -787,8 +783,7 @@ omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc), - checkRuntimeUninitialized(loc)); + uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc)); uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); for (unsigned i = 0; i < varNum; i++) { // Reset buffer. Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -43,6 +43,8 @@ ASSERT0(LT_FUSSY, RequiresOMPRuntime, "Generic always requires initialized runtime."); setExecutionParameters(Generic, RuntimeInitialized); + for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) + parallelLevel[I] = 0; int threadIdInBlock = GetThreadIdInBlock(); ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), @@ -91,32 +93,32 @@ int16_t RequiresDataSharing) { PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); + setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized + : RuntimeUninitialized); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + usedSlotIdx = smid() % MAX_SM; + parallelLevel[0] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + } else if (GetLaneId() == 0) { + parallelLevel[GetWarpId()] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + } if (!RequiresOMPRuntime) { - // If OMP runtime is not required don't initialize OMP state. - setExecutionParameters(Spmd, RuntimeUninitialized); - if (GetThreadIdInBlock() == 0) { - usedSlotIdx = smid() % MAX_SM; - parallelLevel[0] = 0; - } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = 0; - } + // Runtime is not required - exit. __SYNCTHREADS(); return; } - setExecutionParameters(Spmd, RuntimeInitialized); // // Team Context Initialization. // // In SPMD mode there is no master thread so use any cuda thread for team // context initialization. - int threadId = GetThreadIdInBlock(); if (threadId == 0) { // Get a state object from the queue. - int slot = smid() % MAX_SM; - usedSlotIdx = slot; omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[slot].Dequeue(); + omptarget_nvptx_device_State[usedSlotIdx].Dequeue(); omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); @@ -148,7 +150,7 @@ "%d threads\n", (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam()); - if (RequiresDataSharing && threadId % WARPSIZE == 0) { + if (RequiresDataSharing && GetLaneId() == 0) { // Warp master innitializes data sharing environment. unsigned WID = threadId / WARPSIZE; __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( Index: libomptarget/deviceRTLs/nvptx/src/option.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -44,6 +44,8 @@ #define MAX_SM 16 #endif +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + //////////////////////////////////////////////////////////////////////////////// // algo options //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -311,6 +311,7 @@ (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads()); isActive = true; + IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1); } return isActive; @@ -327,6 +328,8 @@ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); + + DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1); } //////////////////////////////////////////////////////////////////////////////// @@ -336,16 +339,11 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + IncParallelLevel(/*ActiveParallel=*/false); + if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - unsigned tnum = __ACTIVEMASK(); - int leader = __ffs(tnum) - 1; - __SHFL_SYNC(tnum, leader, leader); - if (GetLaneId() == leader) - ++parallelLevel[GetWarpId()]; - __SHFL_SYNC(tnum, leader, leader); - return; } @@ -381,15 +379,11 @@ uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + DecParallelLevel(/*ActiveParallel=*/false); + if (checkRuntimeUninitialized(loc)) { ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode with uninitialized runtime."); - unsigned tnum = __ACTIVEMASK(); - int leader = __ffs(tnum) - 1; - __SHFL_SYNC(tnum, leader, leader); - if (GetLaneId() == leader) - --parallelLevel[GetWarpId()]; - __SHFL_SYNC(tnum, leader, leader); return; } @@ -408,21 +402,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - if (checkRuntimeUninitialized(loc)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc), - "Expected SPMD mode with uninitialized runtime."); - return parallelLevel[GetWarpId()] + 1; - } - - int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - if (currTaskDescr->InL2OrHigherParallelRegion()) - return 2; - else if (currTaskDescr->InParallelRegion()) - return 1; - else - return 0; + return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); } // This kmpc call returns the thread id across all teams. It's value is @@ -431,8 +411,7 @@ // of this call. EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - return GetOmpThreadId(tid, checkSPMDMode(loc), - checkRuntimeUninitialized(loc)); + return GetOmpThreadId(tid, checkSPMDMode(loc)); } //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -21,8 +21,7 @@ int32_t __gpu_block_reduce() { bool isSPMDExecutionMode = isSPMDMode(); int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int nt = - GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); + int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode); if (nt != blockDim.x) return 0; unsigned tnum = __ACTIVEMASK(); @@ -40,9 +39,7 @@ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); int numthread; if (currTaskDescr->IsParallelConstruct()) { - numthread = - GetNumberOfOmpThreads(threadId, checkSPMDMode(loc), - checkRuntimeUninitialized(loc)); + numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc)); } else { numthread = GetNumberOfOmpTeams(); } @@ -150,8 +147,8 @@ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + uint32_t NumThreads = + GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode); if (NumThreads == 1) return 1; /* @@ -236,10 +233,9 @@ EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/isSPMDMode(), - /*isRuntimeUninitialized=*/isRuntimeUninitialized()); + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + isSPMDMode(), isRuntimeUninitialized()); } EXTERN @@ -256,36 +252,35 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/true, - /*isRuntimeUninitialized=*/true); + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); } EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/false, - /*isRuntimeUninitialized=*/true); + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); } INLINE -static int32_t nvptx_teams_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, - bool isSPMDExecutionMode, bool isRuntimeUninitialized) { +static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode) { uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. uint32_t NumThreads = isSPMDExecutionMode - ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, - isRuntimeUninitialized) + ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true) : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); @@ -406,10 +401,9 @@ kmp_InterWarpCopyFctPtr cpyFct, kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(), - /*isRuntimeUninitialized=*/isRuntimeUninitialized()); + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, isSPMDMode()); } EXTERN @@ -419,9 +413,7 @@ kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, scratchFct, - ldFct, - /*isSPMDExecutionMode=*/true, - /*isRuntimeUninitialized=*/true); + ldFct, /*isSPMDExecutionMode=*/true); } EXTERN @@ -431,9 +423,7 @@ kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, scratchFct, - ldFct, - /*isSPMDExecutionMode=*/false, - /*isRuntimeUninitialized=*/true); + ldFct, /*isSPMDExecutionMode=*/false); } EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, @@ -484,8 +474,7 @@ // reduction because the workers are waiting for parallel work. uint32_t NumThreads = checkSPMDMode(loc) - ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, - checkRuntimeUninitialized(loc)) + ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true) : /*Master thread only*/ 1; uint32_t TeamId = GetBlockIdInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel(); Index: libomptarget/deviceRTLs/nvptx/src/support.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/support.h +++ libomptarget/deviceRTLs/nvptx/src/support.h @@ -49,15 +49,14 @@ INLINE int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, - bool isRuntimeUninitialized); // omp_thread_num +INLINE int GetOmpThreadId(int threadId, + bool isSPMDExecutionMode); // omp_thread_num INLINE int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team -INLINE int -GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, - bool isRuntimeUninitialized); // omp_num_threads -INLINE int GetNumberOfOmpTeams(); // omp_num_teams +INLINE int GetNumberOfOmpThreads(int threadId, + bool isSPMDExecutionMode); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); @@ -66,6 +65,10 @@ // masters INLINE int IsTeamMaster(int ompThreadId); +// Parallel level +INLINE void IncParallelLevel(bool ActiveParallel); +INLINE void DecParallelLevel(bool ActiveParallel); + //////////////////////////////////////////////////////////////////////////////// // Memory //////////////////////////////////////////////////////////////////////////////// Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -149,40 +149,29 @@ // //////////////////////////////////////////////////////////////////////////////// -INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, - bool isRuntimeUninitialized) { +INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { // omp_thread_num int rc; - - if (isRuntimeUninitialized) { - ASSERT0(LT_FUSSY, isSPMDExecutionMode, - "Uninitialized runtime with non-SPMD mode."); - // For level 2 parallelism all parallel regions are executed sequentially. - if (parallelLevel[GetWarpId()] > 0) - rc = 0; - else - rc = GetThreadIdInBlock(); + if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + rc = 0; + } else if (isSPMDExecutionMode) { + rc = GetThreadIdInBlock(); } else { omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); rc = currTaskDescr->ThreadId(); } return rc; } -INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, - bool isRuntimeUninitialized) { +INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) { // omp_num_threads int rc; - - if (isRuntimeUninitialized) { - ASSERT0(LT_FUSSY, isSPMDExecutionMode, - "Uninitialized runtime with non-SPMD mode."); - // For level 2 parallelism all parallel regions are executed sequentially. - if (parallelLevel[GetWarpId()] > 0) - rc = 1; - else - rc = GetNumberOfThreadsInBlock(); + if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + rc = 1; + } else if (isSPMDExecutionMode) { + rc = GetNumberOfThreadsInBlock(); } else { omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); @@ -212,6 +201,31 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } //////////////////////////////////////////////////////////////////////////////// +// Parallel level + +INLINE void IncParallelLevel(bool ActiveParallel) { + unsigned tnum = __ACTIVEMASK(); + int leader = __ffs(tnum) - 1; + __SHFL_SYNC(tnum, leader, leader); + if (GetLaneId() == leader) { + parallelLevel[GetWarpId()] += + (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + } + __SHFL_SYNC(tnum, leader, leader); +} + +INLINE void DecParallelLevel(bool ActiveParallel) { + unsigned tnum = __ACTIVEMASK(); + int leader = __ffs(tnum) - 1; + __SHFL_SYNC(tnum, leader, leader); + if (GetLaneId() == leader) { + parallelLevel[GetWarpId()] -= + (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + } + __SHFL_SYNC(tnum, leader, leader); +} + +//////////////////////////////////////////////////////////////////////////////// // get OpenMP number of procs // Get the number of processors in the device. Index: libomptarget/deviceRTLs/nvptx/src/sync.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/sync.cu +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -48,8 +48,8 @@ tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - int numberOfActiveOMPThreads = GetNumberOfOmpThreads( - tid, checkSPMDMode(loc_ref), /*isRuntimeUninitialized=*/false); + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref)); if (numberOfActiveOMPThreads > 1) { if (checkSPMDMode(loc_ref)) { __kmpc_barrier_simple_spmd(loc_ref, tid); Index: libomptarget/deviceRTLs/nvptx/test/parallel/nested.c =================================================================== --- libomptarget/deviceRTLs/nvptx/test/parallel/nested.c +++ libomptarget/deviceRTLs/nvptx/test/parallel/nested.c @@ -5,6 +5,7 @@ const int MaxThreads = 1024; const int NumThreads = 64; +const int NumThreads1 = 1; int main(int argc, char *argv[]) { int inParallel = -1, numThreads = -1, threadNum = -1; @@ -14,20 +15,20 @@ check1[i] = check2[i] = 0; } - #pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) +#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) { inParallel = omp_in_parallel(); numThreads = omp_get_num_threads(); threadNum = omp_get_thread_num(); - // Expecting active parallel region. - #pragma omp parallel num_threads(NumThreads) +// Expecting active parallel region. +#pragma omp parallel num_threads(NumThreads) { int id = omp_get_thread_num(); check1[id] += omp_get_num_threads() + omp_in_parallel(); - // Expecting serialized parallel region. - #pragma omp parallel +// Expecting serialized parallel region. +#pragma omp parallel { // Expected to be 1. int nestedInParallel = omp_in_parallel(); @@ -35,7 +36,7 @@ int nestedNumThreads = omp_get_num_threads(); // Expected to be 0. int nestedThreadNum = omp_get_thread_num(); - #pragma omp atomic +#pragma omp atomic check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; } } @@ -52,7 +53,8 @@ int Expected = NumThreads + 1; if (i < NumThreads) { if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, check1[i]); + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, + check1[i]); } } else if (check1[i] != 0) { printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); @@ -68,5 +70,67 @@ } } + inParallel = -1; + numThreads = -1; + threadNum = -1; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = 0; + } + +#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) + { + inParallel = omp_in_parallel(); + numThreads = omp_get_num_threads(); + threadNum = omp_get_thread_num(); + +// Expecting active parallel region. +#pragma omp parallel num_threads(NumThreads1) + { + int id = omp_get_thread_num(); + check1[id] += omp_get_num_threads() + omp_in_parallel(); + +// Expecting serialized parallel region. +#pragma omp parallel + { + // Expected to be 0. + int nestedInParallel = omp_in_parallel(); + // Expected to be 1. + int nestedNumThreads = omp_get_num_threads(); + // Expected to be 0. + int nestedThreadNum = omp_get_thread_num(); +#pragma omp atomic + check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; + } + } + } + + // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 + printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", + inParallel, numThreads, threadNum); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check that all threads reported + // omp_get_num_threads() = 1, omp_in_parallel() = 0. + int Expected = 1; + if (i < NumThreads1) { + if (check1[i] != Expected) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, + check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // Check serialized parallel region. + if (i < NumThreads1) { + if (check2[i] != 1) { + printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + } + return 0; }