diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -296,12 +296,13 @@
                                         uint32_t ThreadLimitClause[3]) const {
   assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 &&
          "Multi dimensional launch not supported yet.");
-  if (ThreadLimitClause[0] > 0 && isGenericMode())
+
+  if (static_cast<int32_t>(ThreadLimitClause[0]) <= 0)
+    ThreadLimitClause[0] = PreferredNumThreads;
+  else if (ThreadLimitClause[0] > 0 && isGenericMode())
     ThreadLimitClause[0] += GenericDevice.getWarpSize();
 
-  return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
-                                     ? ThreadLimitClause[0]
-                                     : PreferredNumThreads);
+  return std::min(MaxNumThreads, ThreadLimitClause[0]);
 }
 
 uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/test/offloading/negative_thread_limit.cpp b/openmp/libomptarget/test/offloading/negative_thread_limit.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/negative_thread_limit.cpp
@@ -0,0 +1,51 @@
+// RUN: %libomptarget-compilexx-run-and-check-amdgcn-amd-amdhsa \
+// RUN:   -check-prefixes=INFO,AMDGPU
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda \
+// RUN:   -check-prefixes=INFO,NVIDIAGPU
+
+#include <omp.h>
+#include <stdio.h>
+
+void createThreads(int threadLimit) {
+  int threadCount = 0;
+#pragma omp target teams distribute parallel for map(from : threadCount)      \
+    thread_limit(threadLimit)
+  for (int i = 0; i < 1; ++i)
+    threadCount = omp_get_num_threads();
+
+  printf("threadCount=%d\n", threadCount);
+}
+
+int main(int argc, char *argv[]) {
+  int isHost = -1;
+
+#pragma omp target map(from : isHost)
+  { isHost = omp_is_initial_device(); }
+
+  // Make sure we run on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  // Set the thread limit to a large negative number, such that the
+  // addition of GV_Warp_Size cannot push it back into the positive range.
+  // thread_limit: value < -GV_Warp_Size < 0 -- expected: GV_Default_WG_Size
+  createThreads((int)0xF0000000);
+
+  // thread_limit: value > GV_Max_WG_Size > 0 -- expected: GV_Max_WG_Size
+  // Note: on NVIDIA this could also be CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+  createThreads((int)0x40000000);
+
+  // thread_limit: value == 0 -- expected: GV_Default_WG_Size
+  createThreads(0);
+
+  return isHost;
+}
+
+// INFO: Target region executed on the device
+
+// AMDGPU: threadCount=256
+// AMDGPU: threadCount=1024
+// AMDGPU: threadCount=256
+
+// NVIDIAGPU: threadCount=128
+// NVIDIAGPU: threadCount={{1024|384}}
+// NVIDIAGPU: threadCount=128
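
For reference, a minimal standalone sketch (not part of the patch) of the clamping behaviour the PluginInterface.cpp hunk implements: a thread_limit that is negative or zero when reinterpreted as signed falls back to the preferred default, a positive one is bumped by the warp size in generic mode, and the result is clamped to the device maximum. MaxNumThreads, PreferredNumThreads, and WarpSize below are hypothetical stand-ins for the values the plugin queries from the device.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for plugin-provided device limits.
    constexpr uint32_t MaxNumThreads = 1024;      // e.g. GV_Max_WG_Size
    constexpr uint32_t PreferredNumThreads = 256; // e.g. GV_Default_WG_Size
    constexpr uint32_t WarpSize = 64;

    // Mirrors the patched getNumThreads(): negative clause values arrive
    // as large uint32_t, so the signed cast detects them.
    uint32_t getNumThreads(uint32_t ThreadLimit, bool IsGenericMode) {
      if (static_cast<int32_t>(ThreadLimit) <= 0)
        ThreadLimit = PreferredNumThreads;
      else if (IsGenericMode)
        ThreadLimit += WarpSize;
      return std::min(MaxNumThreads, ThreadLimit);
    }

    int main() {
      printf("%u\n", (unsigned)getNumThreads(0xF0000000u, true)); // negative -> 256
      printf("%u\n", (unsigned)getNumThreads(0x40000000u, true)); // huge positive -> 1024
      printf("%u\n", (unsigned)getNumThreads(0u, true));          // zero -> 256
    }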