diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h --- a/openmp/libomptarget/DeviceRTL/include/Interface.h +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -200,12 +200,6 @@ /// Called by the worker threads in the parallel region (function). void __kmpc_get_shared_variables(void ***GlobalArgs); -/// External interface to get the thread ID. -uint32_t __kmpc_get_hardware_thread_id_in_block(); - -/// External interface to get the number of threads. -uint32_t __kmpc_get_hardware_num_threads_in_block(); - /// Kernel /// ///{ diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h --- a/openmp/libomptarget/DeviceRTL/include/Mapping.h +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -65,7 +65,7 @@ /// Return the number of warps in the block. uint32_t getNumberOfWarpsInBlock(); -/// Return the block Id in the kernel, in [0, getKernelSize()). +/// Return the block Id in the kernel, in [0, getBlockSize()). uint32_t getBlockId(); /// Return the block size, thus number of threads in the block. diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp --- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -43,6 +43,12 @@ return (r < group_size) ? r : group_size; } +uint32_t getNumberOfThreadsInBlock() { + return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), + __builtin_amdgcn_grid_size_x(), + __builtin_amdgcn_workgroup_size_x()); +} + LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } LaneMaskTy lanemaskLT() { @@ -67,13 +73,6 @@ uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } -uint32_t getBlockSize() { - // TODO: verify this logic for generic mode. - return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), - __builtin_amdgcn_grid_size_x(), - __builtin_amdgcn_workgroup_size_x()); -} - uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } @@ -83,18 +82,6 @@ __builtin_amdgcn_workgroup_size_x()); } -uint32_t getNumberOfProcessorElements() { - return getBlockSize(); -} - -uint32_t getWarpId() { - return mapping::getThreadIdInBlock() / mapping::getWarpSize(); -} - -uint32_t getNumberOfWarpsInBlock() { - return mapping::getBlockSize() / mapping::getWarpSize(); -} - #pragma omp end declare variant ///} @@ -132,35 +119,19 @@ uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } -uint32_t getBlockSize() { - return __nvvm_read_ptx_sreg_ntid_x() - - (!mapping::isSPMDMode() * mapping::getWarpSize()); +uint32_t getKernelSize() { + return mapping::getNumberOfBlocks() * mapping::getBlockSize(); } -uint32_t getKernelSize() { return __nvvm_read_ptx_sreg_nctaid_x(); } - uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } -uint32_t getNumberOfProcessorElements() { - return __nvvm_read_ptx_sreg_ntid_x(); -} - -uint32_t getWarpId() { - return mapping::getThreadIdInBlock() / mapping::getWarpSize(); -} - -uint32_t getNumberOfWarpsInBlock() { - return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); -} +uint32_t getNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } #pragma omp end declare variant ///} -uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; } - } // namespace impl } // namespace _OMP @@ -194,7 +165,10 @@ uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); } -uint32_t mapping::getBlockSize() { return impl::getBlockSize(); } +uint32_t mapping::getBlockSize() { + return impl::getNumberOfThreadsInBlock() - + (!mapping::isSPMDMode() * mapping::getWarpSize()); +} uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } @@ -203,21 +177,25 @@ uint32_t mapping::getNumberOfBlocks() { return impl::getNumberOfBlocks(); } uint32_t mapping::getNumberOfProcessorElements() { - return impl::getNumberOfProcessorElements(); + // TODO: This should probably look at the actual hardware. + return impl::getNumberOfThreadsInBlock(); } -uint32_t mapping::getWarpId() { return impl::getWarpId(); } +uint32_t mapping::getWarpId() { + return return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} -uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } +uint32_t mapping::getWarpSize() { return impl::getGridValue().GV_Warp_Size; } uint32_t mapping::getNumberOfWarpsInBlock() { - return impl::getNumberOfWarpsInBlock(); + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); } /// Execution mode /// ///{ -static int SHARED(IsSPMDMode); +static bool SHARED(IsSPMDMode); void mapping::init(bool IsSPMD) { if (!mapping::getThreadIdInBlock()) @@ -229,13 +207,4 @@ bool mapping::isGenericMode() { return !isSPMDMode(); } ///} -extern "C" { -__attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { - return mapping::getThreadIdInBlock(); -} - -__attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { - return mapping::getNumberOfProcessorElements(); -} -} #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -449,7 +449,7 @@ return omp_get_level() > 1 ? 1 : state::ParallelTeamSize; } -int omp_get_thread_limit(void) { return mapping::getKernelSize(); } +int omp_get_thread_limit(void) { return mapping::getBlockSize(); } int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }