diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -720,6 +720,7 @@ * ``LIBOMPTARGET_JIT_REPLACEMENT_MODULE= (LLVM-IR file)`` * ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE= (LLVM-IR file)`` * ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE= (LLVM-IR file)`` + * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT= (default: 32)`` LIBOMPTARGET_DEBUG """""""""""""""""" @@ -1108,7 +1109,7 @@ LIBOMPTARGET_JIT_POST_OPT_IR_MODULE -"""""""""""""""""""""""""""""""""" +""""""""""""""""""""""""""""""""""" This environment variable can be used to extract the embedded device code after the device JIT runs additional IR optimizations on it (see @@ -1118,6 +1119,18 @@ :ref:`LIBOMPTARGET_JIT_REPLACEMENT_MODULE`. +LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT +""""""""""""""""""""""""""""""""""""""""""" + +This environment variable defines a lower bound for the number of threads if a +combined kernel, e.g., `target teams distribute parallel for`, has insufficient +parallelism. Especially if the trip count of the loops is lower than the number +of threads possible times the number of teams (a.k.a. blocks) the device prefers +(see also :ref:`LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`), we will reduce the thread +count to increase outer (team/block) parallelism. The thread count will never +be reduced below the value passed for this environment variable though. + + .. _libomptarget_plugin: diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -313,9 +313,11 @@ /// user-defined threads and block clauses. 
uint32_t getNumThreads(GenericDeviceTy &GenericDevice, uint32_t ThreadLimitClause[3]) const; + + /// The number of threads \p NumThreads can be adjusted by this method. uint64_t getNumBlocks(GenericDeviceTy &GenericDevice, uint32_t BlockLimitClause[3], uint64_t LoopTripCount, - uint32_t NumThreads) const; + uint32_t &NumThreads) const; /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. bool isGenericSPMDMode() const { @@ -740,6 +742,14 @@ return std::move(MB); } + /// The minimum number of threads we use for a low-trip count combined loop. + /// Instead of using more threads we increase the outer (block/team) + /// parallelism. + /// @see OMPX_MinThreadsForLowTripCount + virtual uint32_t getMinThreadsForLowTripCountLoop() { + return OMPX_MinThreadsForLowTripCount; + } + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, @@ -783,6 +793,12 @@ UInt64Envar OMPX_TargetStackSize; UInt64Envar OMPX_TargetHeapSize; + /// Environment flag to set the minimum number of threads we use for a + /// low-trip count combined loop. Instead of using more threads we increase + /// the outer (block/team) parallelism. + UInt32Envar OMPX_MinThreadsForLowTripCount = + UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32); + protected: /// Return the execution mode used for kernel \p Name. 
Expected getExecutionModeForKernel(StringRef Name, diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -19,6 +19,7 @@ #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Support/Error.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" #include @@ -301,7 +302,7 @@ uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, uint32_t NumTeamsClause[3], uint64_t LoopTripCount, - uint32_t NumThreads) const { + uint32_t &NumThreads) const { assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 && "Multi dimensional launch not supported yet."); @@ -312,14 +313,50 @@ return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit()); } + uint64_t DefaultNumBlocks = getDefaultNumBlocks(GenericDevice); uint64_t TripCountNumBlocks = std::numeric_limits::max(); if (LoopTripCount > 0) { if (isSPMDMode()) { // We have a combined construct, i.e. `target teams distribute // parallel for [simd]`. We launch so many teams so that each thread - // will execute one iteration of the loop. round up to the nearest - // integer - TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + // will execute one iteration of the loop; rounded up to the nearest + // integer. However, if that results in too few teams, we artificially + // reduce the thread count per team to increase the outer parallelism. + auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop(); + MinThreads = std::min(MinThreads, NumThreads); + + // Honor the thread_limit clause; only lower the number of threads. 
+ auto OldNumThreads = NumThreads; + if (LoopTripCount >= DefaultNumBlocks * NumThreads) { + // Enough parallelism for teams and threads. + TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + assert(TripCountNumBlocks >= DefaultNumBlocks && + "Expected sufficient outer parallelism."); + } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) { + // Enough parallelism for teams, limit threads. + + // This case is hard; for now, we force "full warps": + // First, compute a thread count assuming DefaultNumBlocks. + auto NumThreadsDefaultBlocks = + (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks; + // Now get a power of two that is larger or equal. + auto NumThreadsDefaultBlocksP2 = + llvm::PowerOf2Ceil(NumThreadsDefaultBlocks); + // Do not increase a thread limit given by the user. + NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2)); + assert(NumThreads >= MinThreads && + "Expected sufficient inner parallelism."); + TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + } else { + // Not enough parallelism for teams and threads, limit both. + NumThreads = std::min(NumThreads, MinThreads); + TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + } + + assert(NumThreads * TripCountNumBlocks >= LoopTripCount && + "Expected sufficient parallelism"); + assert(OldNumThreads >= NumThreads && + "Number of threads cannot be increased!"); } else { assert((isGenericMode() || isGenericSPMDMode()) && "Unexpected execution mode!"); @@ -339,8 +376,7 @@ } } // If the loops are long running we rather reuse blocks than spawn too many. 
- uint32_t PreferredNumBlocks = std::min(uint32_t(TripCountNumBlocks), - getDefaultNumBlocks(GenericDevice)); + uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks); return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit()); } diff --git a/openmp/libomptarget/test/offloading/small_trip_count.c b/openmp/libomptarget/test/offloading/small_trip_count.c new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/test/offloading/small_trip_count.c @@ -0,0 +1,41 @@ +// clang-format off +// RUN: %libomptarget-compile-generic +// RUN: env LIBOMPTARGET_INFO=16 \ +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT +// RUN: env LIBOMPTARGET_INFO=16 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=8 \ +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=EIGHT + +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#define N 128 + +__attribute__((optnone)) void optnone() {} + +int main() { + // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode + // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N; ++i) { + optnone(); + } + // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode + // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N - 1; ++i) { + optnone(); + } + // DEFAULT: Launching kernel {{.+_main_.+}} with 5 blocks and 32 threads in SPMD mode + // EIGHT: Launching kernel {{.+_main_.+}} with 17 blocks and 8 threads in SPMD mode +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N + 1; ++i) { + optnone(); + } + // DEFAULT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode + // EIGHT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads 
in SPMD mode +#pragma omp target teams distribute parallel for simd thread_limit(4) + for (int i = 0; i < N; ++i) { + optnone(); + } +} +