diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -195,6 +195,7 @@
 __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_flush, false, Void, IdentPtr)
 __OMP_RTL(__kmpc_global_thread_num, false, Int32, IdentPtr)
+__OMP_RTL(__kmpc_get_hardware_thread_id_in_block, false, Int32, )
 __OMP_RTL(__kmpc_fork_call, true, Void, IdentPtr, Int32, ParallelTaskPtr)
 __OMP_RTL(__kmpc_omp_taskwait, false, Int32, IdentPtr, Int32)
 __OMP_RTL(__kmpc_omp_taskyield, false, Int32, IdentPtr, Int32, /* Int */ Int32)
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -133,7 +133,7 @@
                               __builtin_amdgcn_workgroup_size_x());
 }
 
-EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
@@ -212,7 +212,7 @@
 }
 
 // Calls to the AMDGCN layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); }
 EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
 
 #pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ b/openmp/libomptarget/deviceRTLs/common/debug.h
@@ -132,8 +132,9 @@
 
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-         (int)GetWarpId(), (int)GetLaneId(), parameters...);
+  printf(fmt, (int)GetBlockIdInKernel(),
+         (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
+         (int)GetLaneId(), parameters...);
 }
 
 #endif
@@ -143,8 +144,9 @@
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
   if (!cond) {
-    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-           (int)GetWarpId(), (int)GetLaneId(), parameters...);
+    printf(fmt, (int)GetBlockIdInKernel(),
+           (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
+           (int)GetLaneId(), parameters...);
     __builtin_trap();
   }
 }
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -54,7 +54,8 @@
   items.flags =
       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
   items.threadId =
-      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
+                                                // called for 1st level)
   items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1
   prev = parentTaskDescr;
 }
@@ -97,16 +98,16 @@
   //
   // overwrite specific items;
   //
-  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
-  // This is so that the serial master (first lane in the master warp)
-  // gets a threadId of 0.
-  // However, we know that this function is always called in a parallel
-  // region where only workers are active. The serial master thread
-  // never enters this region. When a parallel region is executed serially,
-  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
-  // are called, which never activate this region.
+  // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
+  // GetMasterThreadID(). This is so that the serial master (first lane in the
+  // master warp) gets a threadId of 0. However, we know that this function is
+  // always called in a parallel region where only workers are active. The
+  // serial master thread never enters this region. When a parallel region is
+  // executed serially, the threadId is set to 0 elsewhere and the
+  // kmpc_serialized_* functions are called, which never activate this region.
   items.threadId =
-      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
+                                                // called for 1st level)
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -48,7 +48,8 @@
   void *Ptr;
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
   unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
+  bool IsWarpLeader =
+      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
   if (IsWarpLeader)
     Ptr = Alloc();
   // Get address from the first active lane.
@@ -61,7 +62,7 @@
 
 EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
   Bytes = Bytes + (Bytes % MinBytes);
-  int TID = GetThreadIdInBlock();
+  int TID = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(TID)) {
     // Main thread alone, use shared memory if space is available.
     if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
@@ -97,7 +98,8 @@
 EXTERN void __kmpc_free_shared(void *Ptr) {
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
   unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
-  bool IsWarpLeader = (GetThreadIdInBlock() % WARPSIZE) == LeaderID;
+  bool IsWarpLeader =
+      (__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
   __kmpc_syncwarp(CurActive);
   if (IsWarpLeader) {
     if (Ptr >= &MainSharedStack.Data[0] &&
@@ -190,13 +192,14 @@
     return;
   }
   if (isSPMDExecutionMode) {
-    if (GetThreadIdInBlock() == 0) {
+    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
     __kmpc_impl_syncthreads();
     return;
   }
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __kmpc_impl_threadfence();
@@ -208,13 +211,14 @@
     return;
   if (isSPMDExecutionMode) {
     __kmpc_impl_syncthreads();
-    if (GetThreadIdInBlock() == 0) {
+    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
       omptarget_nvptx_simpleMemoryManager.Release();
     }
     return;
   }
   __kmpc_impl_threadfence();
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   omptarget_nvptx_simpleMemoryManager.Release();
 }
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -142,7 +142,7 @@
 
 EXTERN int omp_get_ancestor_thread_num(int level) {
   if (__kmpc_is_spmd_exec_mode())
-    return level == 1 ? GetThreadIdInBlock() : 0;
+    return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
   int rc = -1;
   // If level is 0 or all parallel regions are not active - return 0.
   unsigned parLevel = parallelLevel[GetWarpId()];
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -34,7 +34,7 @@
   if (GetLaneId() == 0)
     parallelLevel[GetWarpId()] = 0;
 
-  int threadIdInBlock = GetThreadIdInBlock();
+  int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
   if (threadIdInBlock != GetMasterThreadID())
     return;
 
@@ -87,7 +87,7 @@
   setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized
                                                    : RuntimeUninitialized);
 
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   if (threadId == 0) {
     usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
   }
@@ -147,7 +147,7 @@
     return;
 
   __kmpc_impl_syncthreads();
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   if (threadId == 0) {
     // Enqueue omp state object for use by another team.
     int slot = usedSlotIdx;
@@ -169,7 +169,7 @@
 
 static void __kmpc_target_region_state_machine(ident_t *Ident) {
 
-  int TId = GetThreadIdInBlock();
+  int TId = __kmpc_get_hardware_thread_id_in_block();
   do {
     void* WorkFn = 0;
 
@@ -199,7 +199,7 @@
 int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD,
                            bool UseGenericStateMachine,
                            bool RequiresFullRuntime) {
-  int TId = GetThreadIdInBlock();
+  int TId = __kmpc_get_hardware_thread_id_in_block();
   if (IsSPMD)
     __kmpc_spmd_kernel_init(RequiresFullRuntime);
   else
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -105,7 +105,8 @@
 
   ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
          (int)NumThreads);
-  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+  ASSERT0(LT_FUSSY,
+          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "only team master can create parallel");
 
   // Set number of threads on work descriptor.
@@ -133,7 +134,7 @@
 
   // Only the worker threads call this routine and the master warp
   // never arrives here. Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
   // Set to true for workers participating in the parallel region.
   bool isActive = false;
@@ -166,7 +167,7 @@
 
   // Only the worker threads call this routine and the master warp
   // never arrives here. Therefore, use the nvptx thread id.
-  int threadId = GetThreadIdInBlock();
+  int threadId = __kmpc_get_hardware_thread_id_in_block();
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, currTaskDescr->GetPrevTaskDescr());
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -47,7 +47,7 @@
 INLINE static uint32_t
 gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
   uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
+  physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
   __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
   __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
   uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
@@ -95,9 +95,10 @@
   if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % WARPSIZE,
-                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+    gpu_irregular_warp_reduce(
+        reduce_data, shflFct,
+        /*LaneCount=*/NumThreads % WARPSIZE,
+        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
 
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
@@ -118,9 +119,10 @@
   if (Liveness == __kmpc_impl_all_lanes) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
   else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/__kmpc_impl_popc(Liveness),
-                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+    gpu_irregular_warp_reduce(
+        reduce_data, shflFct,
+        /*LaneCount=*/__kmpc_impl_popc(Liveness),
+        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
   else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                     // parallel region may enter here; return
                                     // early.
@@ -185,7 +187,7 @@
 
   // Terminate all threads in non-SPMD mode except for the master thread.
   if (!__kmpc_is_spmd_exec_mode() &&
-      !__kmpc_is_generic_main_thread(GetThreadIdInBlock()))
+      !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
     return 0;
 
   uint32_t ThreadId = GetLogicalThreadIdInBlock();
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -70,7 +70,7 @@
 int GetLogicalThreadIdInBlock() {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
-  int tid = GetThreadIdInBlock();
+  int tid = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(tid))
     return 0;
   else
@@ -84,7 +84,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 int GetOmpThreadId() {
-  int tid = GetThreadIdInBlock();
+  int tid = __kmpc_get_hardware_thread_id_in_block();
   if (__kmpc_is_generic_main_thread(tid))
     return 0;
   // omp_thread_num
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -60,7 +60,7 @@
   return Mask;
 }
 
-EXTERN void __kmpc_impl_syncthreads() {
+EXTERN void __kmpc_impl_syncthreads() {
   int barrier = 2;
   asm volatile("barrier.sync %0;"
               :
@@ -92,15 +92,21 @@
 EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+EXTERN int __kmpc_get_hardware_thread_id_in_block() {
+  return __nvvm_read_ptx_sreg_tid_x();
+}
 EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
 EXTERN int GetNumberOfBlocksInKernel() {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
 EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
-EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpId() {
+  return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
+}
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
-EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+EXTERN unsigned GetLaneId() {
+  return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
+}
 
 // Atomics
 uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -16,7 +16,7 @@
 #include "target_impl.h"
 
 // Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int GetThreadIdInBlock();
+EXTERN int __kmpc_get_hardware_thread_id_in_block();
 EXTERN int GetBlockIdInKernel();
 EXTERN int GetNumberOfBlocksInKernel();
 EXTERN int GetNumberOfThreadsInBlock();
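
Note (not part of the patch): the sketch below is a minimal host-side model of the arithmetic the patch keeps in GetWarpId and GetLaneId, with the hardware thread id stubbed out; the stub variable, helper names, and main() are illustrative assumptions, not runtime API.

// Host-side sketch only: mirrors tid / WARPSIZE and tid & (WARPSIZE - 1) from
// nvptx/src/target_impl.cu, assuming a warp width of 32 and a fake thread id.
#include <cassert>

constexpr unsigned WARPSIZE = 32;

// Stand-in for __kmpc_get_hardware_thread_id_in_block(); the real NVPTX
// implementation reads %tid.x and AMDGCN uses __builtin_amdgcn_workitem_id_x().
static int FakeHardwareThreadId = 0;
static int GetHardwareThreadIdInBlock() { return FakeHardwareThreadId; }

static unsigned WarpId() { return GetHardwareThreadIdInBlock() / WARPSIZE; }
static unsigned LaneId() { return GetHardwareThreadIdInBlock() & (WARPSIZE - 1); }

int main() {
  FakeHardwareThreadId = 70; // pretend we are thread 70 of the block
  assert(WarpId() == 2);     // 70 / 32 == 2
  assert(LaneId() == 6);     // 70 & 31 == 6
  return 0;
}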