Index: lib/CodeGen/TargetInfo.cpp =================================================================== --- lib/CodeGen/TargetInfo.cpp +++ lib/CodeGen/TargetInfo.cpp @@ -4764,7 +4764,10 @@ void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const override; private: - static void addKernelMetadata(llvm::Function *F); + // Adds a NamedMDNode with F, Name, and Operand as operands, and adds the + // resulting MDNode to the nvvm.annotations MDNode. + static void addNVVMMetadata(llvm::Function *F, StringRef Name, + const int Operand); }; ABIArgInfo NVPTXABIInfo::classifyReturnType(QualType RetTy) const { @@ -4823,7 +4826,8 @@ // By default, all functions are device functions if (FD->hasAttr()) { // OpenCL __kernel functions get kernel metadata - addKernelMetadata(F); + // Create !{, metadata !"kernel", i32 1} node + addNVVMMetadata(F, "kernel", 1); // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); } @@ -4834,28 +4838,43 @@ // CUDA __global__ functions get a kernel metadata entry. Since // __global__ functions cannot be called from the device, we do not // need to set the noinline attribute. - if (FD->hasAttr()) - addKernelMetadata(F); + if (FD->hasAttr()) { + // Create !{, metadata !"kernel", i32 1} node + addNVVMMetadata(F, "kernel", 1); + } + if (FD->hasAttr()) { + // Create !{, metadata !"maxntidx", i32 } node + addNVVMMetadata(F, "maxntidx", + FD->getAttr()->getMaxThreads()); + // min blocks is a default argument for CUDALaunchBoundsAttr, so getting a + // zero value from getMinBlocks either means it was not specified in + // __launch_bounds__ or the user specified a 0 value. In both cases, we + // don't have to add a PTX directive. + int minctasm = FD->getAttr()->getMinBlocks(); + if (minctasm > 0) { + // Create !{, metadata !"minctasm", i32 } node + addNVVMMetadata(F, "minctasm", minctasm); + } + } } } -void NVPTXTargetCodeGenInfo::addKernelMetadata(llvm::Function *F) { +void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::Function *F, StringRef Name, + const int Operand) { llvm::Module *M = F->getParent(); llvm::LLVMContext &Ctx = M->getContext(); // Get "nvvm.annotations" metadata node llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); - // Create !{, metadata !"kernel", i32 1} node llvm::SmallVector MDVals; MDVals.push_back(F); - MDVals.push_back(llvm::MDString::get(Ctx, "kernel")); - MDVals.push_back(llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1)); - + MDVals.push_back(llvm::MDString::get(Ctx, Name)); + MDVals.push_back( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand)); // Append metadata to nvvm.annotations MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); } - } //===----------------------------------------------------------------------===// Index: test/CodeGenCUDA/launch-bounds.cu =================================================================== --- /dev/null +++ test/CodeGenCUDA/launch-bounds.cu @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s + +#include "../SemaCUDA/cuda.h" + +#define MAX_THREADS_PER_BLOCK 256 +#define MIN_BLOCKS_PER_MP 2 + +// Test both max threads per block and Min cta per sm. +extern "C" { +__global__ void +__launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP ) +Kernel1() +{ +} +} + +// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel1, metadata !"maxntidx", i32 256} +// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel1, metadata !"minctasm", i32 2} + +// Test only max threads per block. Min cta per sm defaults to 0, and +// CodeGen doesn't output a zero value for minctasm. +extern "C" { +__global__ void +__launch_bounds__( MAX_THREADS_PER_BLOCK ) +Kernel2() +{ +} +} + +// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel2, metadata !"maxntidx", i32 256}