Index: clang/test/OpenMP/thread_limit_nvptx.c
===================================================================
--- /dev/null
+++ clang/test/OpenMP/thread_limit_nvptx.c
@@ -0,0 +1,33 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void foo(int N) {
+// CHECK: l11, !"maxntidx", i32 128}
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i)
+    ;
+// CHECK: l15, !"maxntidx", i32 4}
+#pragma omp target teams distribute parallel for simd thread_limit(4)
+  for (int i = 0; i < N; ++i)
+    ;
+// TODO: We should not emit two maxntidx annotations.
+// CHECK: l21, !"maxntidx", i32 128}
+// CHECK: l21, !"maxntidx", i32 42}
+#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
+  for (int i = 0; i < N; ++i)
+    ;
+// TODO: We should not emit two maxntidx annotations.
+// CHECK: l27, !"maxntidx", i32 22}
+// CHECK: l27, !"maxntidx", i32 42}
+#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
+  for (int i = 0; i < N; ++i)
+    ;
+}
+
+#endif
+
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -33,7 +33,9 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -4156,7 +4158,38 @@
     OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
                           llvm::utostr(1) + "," + llvm::utostr(NumThreads));
   } else {
-    // TODO: Modify or create "maxntidx" module metadata.
+    // Update the "maxntidx" metadata for NVIDIA, or add it if absent.
+    NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+    MDNode *ExistingOp = nullptr;
+    for (auto *Op : MD->operands()) {
+      if (Op->getNumOperands() != 3)
+        continue;
+      auto *Kernel = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
+      if (!Kernel || Kernel->getValue() != OutlinedFn)
+        continue;
+      auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
+      if (!Prop || Prop->getString() != "maxntidx")
+        continue;
+      ExistingOp = Op;
+      break;
+    }
+    if (ExistingOp) {
+      auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
+      int32_t OldLimit =
+          cast<ConstantInt>(OldVal->getValue())->getZExtValue();
+      ExistingOp->replaceOperandWith(
+          2, ConstantAsMetadata::get(
+                 ConstantInt::get(OldVal->getValue()->getType(),
                                   std::min(OldLimit, NumThreads))));
+    } else {
+      LLVMContext &Ctx = M.getContext();
+      Metadata *MDVals[] = {ConstantAsMetadata::get(OutlinedFn),
+                            MDString::get(Ctx, "maxntidx"),
+                            ConstantAsMetadata::get(ConstantInt::get(
+                                Type::getInt32Ty(Ctx), NumThreads))};
+      // Append the new annotation to nvvm.annotations.
+      MD->addOperand(MDNode::get(Ctx, MDVals));
+    }
   }
   OutlinedFn->addFnAttr("omp_target_thread_limit",
                         std::to_string(NumThreads));
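
Note on the emitted IR: a minimal sketch of the "nvvm.annotations" entry this code maintains, with a hypothetical kernel name (real names follow the __omp_offloading_*_foo_l<line> scheme the test's CHECK lines match against):

  !nvvm.annotations = !{!0}
  !0 = !{ptr @__omp_offloading_example_foo_l11, !"maxntidx", i32 128}

When a "maxntidx" entry for the kernel already exists, the new code clamps it to the smaller of the old and new limits instead of appending a second triple from this path; the remaining duplicates called out in the test's TODOs presumably come from the separate launch_bounds lowering of ompx_attribute.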