diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -192,7 +192,7 @@ INLINE omptarget_nvptx_TaskDescr * getMyTopTaskDescriptor(bool isSPMDExecutionMode) { - return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -15,11 +15,6 @@ #include "target/shuffle.h" #include "target_impl.h" -// Return true if this is the master thread. -INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { - return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); -} - //////////////////////////////////////////////////////////////////////////////// // Runtime functions for trunk data sharing scheme. //////////////////////////////////////////////////////////////////////////////// @@ -66,7 +61,8 @@ EXTERN void *__kmpc_alloc_shared(size_t Bytes) { Bytes = Bytes + (Bytes % MinBytes); - if (IsMasterThread(__kmpc_is_spmd_exec_mode())) { + int TID = GetThreadIdInBlock(); + if (__kmpc_is_generic_main_thread(TID)) { // Main thread alone, use shared memory if space is available. if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) { void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]]; @@ -75,7 +71,6 @@ return Ptr; } } else { - int TID = GetThreadIdInBlock(); int WID = GetWarpId(); unsigned WarpBytes = Bytes * WARPSIZE; auto AllocSharedStack = [&]() { @@ -92,7 +87,6 @@ return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes); } // Fallback to malloc - int TID = GetThreadIdInBlock(); unsigned WarpBytes = Bytes * WARPSIZE; auto AllocGlobal = [&] { return SafeMalloc(WarpBytes, "AllocGlobalFallback"); diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -68,9 +68,7 @@ } EXTERN int omp_get_thread_num() { - bool isSPMDExecutionMode = __kmpc_is_spmd_exec_mode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetOmpThreadId(tid, isSPMDExecutionMode); + int rc = GetOmpThreadId(); PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); return rc; } diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -210,7 +210,7 @@ ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode."); return; } - int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()); T tripCount = ub - lb + 1; // +1 because ub is inclusive @@ -453,7 +453,7 @@ // ID of a thread in its own warp // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int tid = GetLogicalThreadIdInBlock(); ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()), "current thread is not needed here; error"); // retrieve schedule diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -160,6 +160,10 @@ return (execution_param & ModeMask) == Spmd; } +EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) { + return !__kmpc_is_spmd_exec_mode() && GetMasterThreadID() == Tid; +} + EXTERN bool __kmpc_kernel_parallel(void**WorkFn); static void __kmpc_target_region_state_machine(ident_t *Ident) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -188,7 +188,7 @@ } // assume this is only called for nested parallel - int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int threadId = GetLogicalThreadIdInBlock(); // unlike actual parallel, threads in the same team do not share // the workTaskDescr in this case and num threads is fixed to 1 @@ -227,7 +227,7 @@ } // pop stack - int threadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int threadId = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); // set new top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( @@ -249,8 +249,7 @@ // it's cheap to recalculate this value so we never use the result // of this call. EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); - return GetOmpThreadId(tid, __kmpc_is_spmd_exec_mode()); + return GetOmpThreadId(); } //////////////////////////////////////////////////////////////////////////////// @@ -262,7 +261,7 @@ PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = num_threads; } diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -69,7 +69,7 @@ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); if (NumThreads == 1) return 1; @@ -184,10 +184,11 @@ kmp_ListGlobalFctPtr glredFct) { // Terminate all threads in non-SPMD mode except for the master thread. - if (!__kmpc_is_spmd_exec_mode() && GetThreadIdInBlock() != GetMasterThreadID()) + if (!__kmpc_is_spmd_exec_mode() && + !__kmpc_is_generic_main_thread(GetThreadIdInBlock())) return 0; - uint32_t ThreadId = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + uint32_t ThreadId = GetLogicalThreadIdInBlock(); // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -67,11 +67,11 @@ // or a serial region by the master. If the master (whose CUDA thread // id is GetMasterThreadID()) calls this routine, we return 0 because // it is a shadow for the first worker. -int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { +int GetLogicalThreadIdInBlock() { // Implemented using control flow (predication) instead of with a modulo // operation. int tid = GetThreadIdInBlock(); - if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) + if (__kmpc_is_generic_main_thread(tid)) return 0; else return tid; @@ -83,16 +83,19 @@ // //////////////////////////////////////////////////////////////////////////////// -int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { +int GetOmpThreadId() { + int tid = GetThreadIdInBlock(); + if (__kmpc_is_generic_main_thread(tid)) + return 0; // omp_thread_num int rc; if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { rc = 0; - } else if (isSPMDExecutionMode) { - rc = GetThreadIdInBlock(); + } else if (__kmpc_is_spmd_exec_mode()) { + rc = tid; } else { omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); rc = currTaskDescr->ThreadId(); } diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -47,7 +47,7 @@ "Expected SPMD mode with uninitialized runtime."); __kmpc_barrier_simple_spmd(loc_ref, tid); } else { - tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + tid = GetLogicalThreadIdInBlock(); int numberOfActiveOMPThreads = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()); if (numberOfActiveOMPThreads > 1) { diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu --- a/openmp/libomptarget/deviceRTLs/common/src/task.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu @@ -96,7 +96,7 @@ "bad assumptions"); // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top @@ -135,7 +135,7 @@ "bad assumptions"); // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top @@ -163,7 +163,7 @@ omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); // 3... noting to call... is inline // 4. pop context - int tid = GetLogicalThreadIdInBlock(__kmpc_is_spmd_exec_mode()); + int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, parentTaskDescr); // 5. free diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -41,13 +41,12 @@ //////////////////////////////////////////////////////////////////////////////// // get global ids to locate tread/team info (constant regardless of OMP) -int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +int GetLogicalThreadIdInBlock(); int GetMasterThreadID(); int GetNumberOfWorkersInTeam(); // get OpenMP thread and team ids -int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num +int GetOmpThreadId(); // omp_thread_num int GetOmpTeamId(); // omp_team_num // get OpenMP number of threads and team diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -449,6 +449,10 @@ // SPMD execution mode interrogation function. EXTERN int8_t __kmpc_is_spmd_exec_mode(); +/// Return true if the hardware thread id \p Tid represents the OpenMP main +/// thread in generic mode outside of a parallel region. +EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid); + EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, const void *buf, size_t size, int16_t is_shared, const void **res);