Index: llvm/trunk/docs/NVPTXUsage.rst =================================================================== --- llvm/trunk/docs/NVPTXUsage.rst +++ llvm/trunk/docs/NVPTXUsage.rst @@ -289,7 +289,7 @@ return my_function_precise(a); } -The default value for all unspecified reflection parameters is zero. +The default value for all unspecified reflection parameters is zero. The ``NVVMReflect`` pass should be executed early in the optimization pipeline, immediately after the link stage. The ``internalize`` pass is also @@ -326,6 +326,18 @@ Therefore, it is recommended that ``NVVMReflect`` is executed early in the optimization pipeline before dead-code elimination. +The NVPTX TargetMachine knows how to schedule ``NVVMReflect`` at the beginning +of your pass manager; just use the following code when setting up your pass +manager: + +.. code-block:: c++ + std::unique_ptr<TargetMachine> TM = ...; + PassManagerBuilder PMBuilder(...); + PMBuilder.addExtension( + PassManagerBuilder::EP_EarlyAsPossible, + [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + TM->addEarlyAsPossiblePasses(PM); + }); Reflection Parameters --------------------- @@ -339,35 +351,16 @@ ``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero ==================== ====================================================== +The value of this flag is determined by the "nvvm-reflect-ftz" module flag. +The following sets the ftz flag to 1. -Invoking NVVMReflect --------------------- - -To ensure that all dead code caused by the reflection pass is eliminated, it -is recommended that the reflection pass is executed early in the LLVM IR -optimization pipeline. The pass takes an optional mapping of reflection -parameter name to an integer value. This mapping can be specified as either a -command-line option to ``opt`` or as an LLVM ``StringMap`` object when -programmatically creating a pass pipeline. - -With ``opt``: - -..
code-block:: text - - # opt -nvvm-reflect -nvvm-reflect-list=<var>=<value>,<var>=<value> module.bc -o module.reflect.bc - - -With programmatic pass pipeline: - -.. code-block:: c++ - - extern FunctionPass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping); - - StringMap<int> ReflectParams; - ReflectParams["__CUDA_FTZ"] = 1; - Passes.add(createNVVMReflectPass(ReflectParams)); - +.. code-block:: llvm + !llvm.module.flags = !{!0} + !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +(``i32 4`` indicates that the value set here overrides the value in another +module we link with. See the `LangRef <LangRef.html#module-flags-metadata>`_ +for details.) Executing PTX ============= Index: llvm/trunk/lib/Target/NVPTX/NVPTX.h =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTX.h +++ llvm/trunk/lib/Target/NVPTX/NVPTX.h @@ -48,7 +48,6 @@ FunctionPass *createNVPTXInferAddressSpacesPass(); FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); FunctionPass *createNVVMReflectPass(); -FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); Index: llvm/trunk/lib/Target/NVPTX/NVVMReflect.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVVMReflect.cpp +++ llvm/trunk/lib/Target/NVPTX/NVVMReflect.cpp @@ -10,11 +10,10 @@ // This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect // with an integer. // -// We choose the value we use by looking, in this order, at: -// -// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42", -// * the StringMap passed to the pass's constructor, and -// * metadata in the module itself. +// We choose the value we use by looking at metadata in the module itself.
Note +// that we intentionally only have one way to choose these values, because other +// parts of LLVM (particularly, InstCombineCall) rely on being able to predict +// the values chosen by this pass. // // If we see an unknown string, we replace its call with 0. // @@ -49,30 +48,17 @@ namespace { class NVVMReflect : public FunctionPass { -private: - StringMap VarMap; - public: static char ID; - NVVMReflect() : NVVMReflect(StringMap()) {} - - NVVMReflect(const StringMap &Mapping) - : FunctionPass(ID), VarMap(Mapping) { + NVVMReflect() : FunctionPass(ID) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); - setVarMap(); } bool runOnFunction(Function &) override; - -private: - void setVarMap(); }; } FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); } -FunctionPass *llvm::createNVVMReflectPass(const StringMap &Mapping) { - return new NVVMReflect(Mapping); -} static cl::opt NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden, @@ -83,35 +69,6 @@ "Replace occurrences of __nvvm_reflect() calls with 0/1", false, false) -static cl::list -ReflectList("nvvm-reflect-list", cl::value_desc("name="), cl::Hidden, - cl::desc("A list of string=num assignments"), - cl::ValueRequired); - -/// The command line can look as follows : -/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2 -/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the -/// ReflectList vector. First, each of ReflectList[i] is 'split' -/// using "," as the delimiter. Then each of this part is split -/// using "=" as the delimiter. 
-void NVVMReflect::setVarMap() { - for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) { - DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n"); - SmallVector NameValList; - StringRef(ReflectList[i]).split(NameValList, ','); - for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) { - SmallVector NameValPair; - NameValList[j].split(NameValPair, '='); - assert(NameValPair.size() == 2 && "name=val expected"); - std::stringstream ValStream(NameValPair[1]); - int Val; - ValStream >> Val; - assert((!(ValStream.fail())) && "integer value expected"); - VarMap[NameValPair[0]] = Val; - } - } -} - bool NVVMReflect::runOnFunction(Function &F) { if (!NVVMReflectEnabled) return false; @@ -199,11 +156,10 @@ DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n"); int ReflectVal = 0; // The default value is 0 - auto Iter = VarMap.find(ReflectArg); - if (Iter != VarMap.end()) - ReflectVal = Iter->second; - else if (ReflectArg == "__CUDA_FTZ") { - // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag. + if (ReflectArg == "__CUDA_FTZ") { + // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag. Our + // choice here must be kept in sync with AutoUpgrade, which uses the same + // technique to detect whether ftz is enabled. if (auto *Flag = mdconst::extract_or_null( F.getParent()->getModuleFlag("nvvm-reflect-ftz"))) ReflectVal = Flag->getSExtValue(); Index: llvm/trunk/test/CodeGen/NVPTX/nvvm-reflect.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/nvvm-reflect.ll +++ llvm/trunk/test/CodeGen/NVPTX/nvvm-reflect.ll @@ -1,30 +1,38 @@ -; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=0 -O2 | FileCheck %s --check-prefix=USE_MUL_0 -; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=1 -O2 | FileCheck %s --check-prefix=USE_MUL_1 +; We run nvvm-reflect (and then optimize) this module twice, once with metadata +; that enables FTZ, and again with metadata that disables it. 
-@str = private unnamed_addr addrspace(4) constant [8 x i8] c"USE_MUL\00" +; RUN: cat %s > %t.noftz +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz +; RUN: opt %t.noftz -S -nvvm-reflect -O2 \ +; RUN: | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK + +; RUN: cat %s > %t.ftz +; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz +; RUN: opt %t.ftz -S -nvvm-reflect -O2 \ +; RUN: | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK + +@str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00" declare i32 @__nvvm_reflect(i8*) declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)*) +; CHECK-LABEL: @foo define float @foo(float %a, float %b) { -; USE_MUL_0: define float @foo -; USE_MUL_0-NOT: call i32 @__nvvm_reflect -; USE_MUL_1: define float @foo -; USE_MUL_1-NOT: call i32 @__nvvm_reflect - %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0)) +; CHECK-NOT: call i32 @__nvvm_reflect + %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0)) %reflect = tail call i32 @__nvvm_reflect(i8* %ptr) %cmp = icmp ugt i32 %reflect, 0 br i1 %cmp, label %use_mul, label %use_add use_mul: -; USE_MUL_1: fmul float %a, %b -; USE_MUL_0-NOT: fadd float %a, %b +; USE_FTZ_1: fmul float %a, %b +; USE_FTZ_0-NOT: fadd float %a, %b %ret1 = fmul float %a, %b br label %exit use_add: -; USE_MUL_0: fadd float %a, %b -; USE_MUL_1-NOT: fmul float %a, %b +; USE_FTZ_0: fadd float %a, %b +; USE_FTZ_1-NOT: fmul float %a, %b %ret2 = fadd float %a, %b br label %exit @@ -35,14 +43,12 @@ declare i32 @llvm.nvvm.reflect.p0i8(i8*) -; USE_MUL_0: define i32 @intrinsic -; USE_MUL_1: define i32 @intrinsic +; CHECK-LABEL: define i32 @intrinsic define i32 @intrinsic() { -; USE_MUL_0-NOT: call i32 @llvm.nvvm.reflect -; USE_MUL_0: ret 
i32 0 -; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect -; USE_MUL_1: ret i32 1 - %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0)) +; CHECK-NOT: call i32 @llvm.nvvm.reflect +; USE_FTZ_0: ret i32 0 +; USE_FTZ_1: ret i32 1 + %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0)) %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr) ret i32 %reflect } @@ -50,26 +56,24 @@ ; CUDA-7.0 passes __nvvm_reflect argument slightly differently. ; Verify that it works, too -@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00" +@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" +; CHECK-LABEL: @bar define float @bar(float %a, float %b) { -; USE_MUL_0: define float @bar -; USE_MUL_0-NOT: call i32 @__nvvm_reflect -; USE_MUL_1: define float @bar -; USE_MUL_1-NOT: call i32 @__nvvm_reflect - %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) +; CHECK-NOT: call i32 @__nvvm_reflect + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) %cmp = icmp ne i32 %reflect, 0 br i1 %cmp, label %use_mul, label %use_add use_mul: -; USE_MUL_1: fmul float %a, %b -; USE_MUL_0-NOT: fadd float %a, %b +; USE_FTZ_1: fmul float %a, %b +; USE_FTZ_0-NOT: fadd float %a, %b %ret1 = fmul float %a, %b br label %exit use_add: -; USE_MUL_0: fadd float %a, %b -; USE_MUL_1-NOT: fmul float %a, %b +; USE_FTZ_0: fadd float %a, %b +; USE_FTZ_1-NOT: fmul float %a, %b %ret2 = fadd float %a, %b br label %exit @@ -77,3 +81,6 @@ %ret = phi float [%ret1, %use_mul], [%ret2, %use_add] ret float %ret } + +!llvm.module.flags = !{!0} +; A module flag is added to the end of this 
file by the RUN lines at the top.