diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallingConv.h"
@@ -37,6 +38,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -4121,6 +4123,19 @@
   Builder.CreateCall(Fn, {});
 }
 
+static const omp::GV &getGridValue(Function *Kernel) {
+  if (Kernel->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    StringRef Features =
+        Kernel->getFnAttribute("target-features").getValueAsString();
+    if (Features.count("+wavefrontsize64"))
+      return omp::getAMDGPUGridValues<64>();
+    return omp::getAMDGPUGridValues<32>();
+  }
+  if (Triple(Kernel->getParent()->getTargetTriple()).isNVPTX())
+    return omp::NVPTXGridValues;
+  llvm_unreachable("No grid value available for this architecture!");
+}
+
 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
     Function *OutlinedFn, int32_t NumTeams, int32_t NumThreads) {
   if (Config.isTargetDevice()) {
@@ -4135,6 +4150,9 @@
   if (NumTeams > 0)
     OutlinedFn->addFnAttr("omp_target_num_teams", std::to_string(NumTeams));
 
+  if (NumThreads == -1 && Config.isGPU())
+    NumThreads = getGridValue(OutlinedFn).GV_Default_WG_Size;
+
   if (NumThreads > 0) {
     if (OutlinedFn->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
       OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
diff --git a/openmp/libomptarget/test/offloading/default_thread_limit.c b/openmp/libomptarget/test/offloading/default_thread_limit.c
new file
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/default_thread_limit.c
@@ -0,0 +1,103 @@
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+__attribute__((optnone)) int optnone() { return 1; }
+
+int main() {
+  int N = optnone() * 4098 * 32;
+
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
+#pragma omp target
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 42 (MaxFlatWorkGroupSize: 1024
+#pragma omp target thread_limit(optnone() * 42)
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
+#pragma omp target thread_limit(optnone() * 42) ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// FIXME: Use the attribute value to imply a thread_limit
+// DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
+#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: MaxFlatWorkGroupSize: 1024
+#pragma omp target
+#pragma omp teams distribute parallel for num_threads(optnone() * 42)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: MaxFlatWorkGroupSize: 1024
+#pragma omp target teams distribute parallel for thread_limit(optnone() * 42)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: MaxFlatWorkGroupSize: 1024
+#pragma omp target teams distribute parallel for num_threads(optnone() * 42)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
+#pragma omp target
+#pragma omp teams distribute parallel for num_threads(9)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
+#pragma omp target thread_limit(4)
+#pragma omp teams distribute parallel for
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
+#pragma omp target
+#pragma omp teams distribute parallel for thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 9 (MaxFlatWorkGroupSize: 9
+#pragma omp target teams distribute parallel for num_threads(9)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+// DEFAULT: 4 (MaxFlatWorkGroupSize: 4
+#pragma omp target teams distribute parallel for simd thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+}
+