diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -127,6 +127,8 @@ INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } DEVICE int GetNumberOfBlocksInKernel(); DEVICE int GetNumberOfThreadsInBlock(); +DEVICE unsigned GetWarpId(); +DEVICE unsigned GetLaneId(); DEVICE bool __kmpc_impl_is_first_active_thread(); diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -14,14 +14,10 @@ // Implementations initially derived from hcc -static DEVICE uint32_t getLaneId(void) { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - // Initialized with a 64-bit mask with bits set in positions less than the // thread's lane number in the warp DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - uint32_t lane = getLaneId(); + uint32_t lane = GetLaneId(); int64_t ballot = __kmpc_impl_activemask(); uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; return mask & ballot; @@ -30,7 +26,7 @@ // Initialized with a 64-bit mask with bits set in positions greater than the // thread's lane number in the warp DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - uint32_t lane = getLaneId(); + uint32_t lane = GetLaneId(); if (lane == (WARPSIZE - 1)) return 0; uint64_t ballot = __kmpc_impl_activemask(); @@ -54,14 +50,14 @@ DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, int32_t srcLane) { int width = WARPSIZE; - int self = getLaneId(); + int self = GetLaneId(); int index = srcLane + (self & ~(width - 1)); return __builtin_amdgcn_ds_bpermute(index << 2, var); } DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, uint32_t laneDelta, int32_t width) { - int self = getLaneId(); + int self = GetLaneId(); int index = self + laneDelta; index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; return __builtin_amdgcn_ds_bpermute(index << 2, var); @@ -71,3 +67,7 @@ EXTERN uint64_t __ockl_get_num_groups(uint32_t); DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -92,16 +92,6 @@ // support: get info from machine //////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// -// Calls to the NVPTX layer (assuming 1D layout) -// -//////////////////////////////////////////////////////////////////////////////// - -DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } - -DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } - //////////////////////////////////////////////////////////////////////////////// // // Calls to the Generic Scheme Implementation Layer (assuming 1D layout) diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -50,10 +50,6 @@ // get info from machine //////////////////////////////////////////////////////////////////////////////// -// get low level ids of resources -DEVICE unsigned GetWarpId(); -DEVICE unsigned GetLaneId(); - // get global ids to locate tread/team info (constant regardless of OMP) DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); DEVICE int GetMasterThreadID(); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -189,6 +189,8 @@ INLINE int GetBlockIdInKernel() { return blockIdx.x; } INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } +INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } // Return true if this is the first active thread in the warp. INLINE bool __kmpc_impl_is_first_active_thread() {