Index: llvm/lib/Target/NVPTX/NVPTX.h =================================================================== --- llvm/lib/Target/NVPTX/NVPTX.h +++ llvm/lib/Target/NVPTX/NVPTX.h @@ -46,7 +46,7 @@ ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMPass(); FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); -FunctionPass *createNVVMReflectPass(); +FunctionPass *createNVVMReflectPass(unsigned int SmVersion); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); Index: llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -195,7 +195,7 @@ Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) { - PM.add(createNVVMReflectPass()); + PM.add(createNVVMReflectPass(Subtarget.getSmVersion())); PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion())); }); } @@ -258,7 +258,8 @@ // it here does nothing. But since we need it for correctness when lowering // to NVPTX, run it here too, in case whoever built our pass pipeline didn't // call addEarlyAsPossiblePasses. 
- addPass(createNVVMReflectPass()); + const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); + addPass(createNVVMReflectPass(ST.getSmVersion())); if (getOptLevel() != CodeGenOpt::None) addPass(createNVPTXImageOptimizerPass()); Index: llvm/lib/Target/NVPTX/NVVMReflect.cpp =================================================================== --- llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -50,7 +50,9 @@ class NVVMReflect : public FunctionPass { public: static char ID; - NVVMReflect() : FunctionPass(ID) { + unsigned int SmVersion; + NVVMReflect() : NVVMReflect(0) {} + explicit NVVMReflect(unsigned int Sm) : FunctionPass(ID), SmVersion(Sm) { initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); } @@ -58,7 +60,9 @@ }; } -FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); } +FunctionPass *llvm::createNVVMReflectPass(unsigned int SmVersion) { + return new NVVMReflect(SmVersion); +} static cl::opt<bool> NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden, @@ -163,6 +167,8 @@ if (auto *Flag = mdconst::extract_or_null<ConstantInt>( F.getParent()->getModuleFlag("nvvm-reflect-ftz"))) ReflectVal = Flag->getSExtValue(); + } else if (ReflectArg == "__CUDA_ARCH") { + ReflectVal = SmVersion * 10; } Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); Index: llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -0,0 +1,21 @@ +; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type. +; Verify that __nvvm_reflect() is replaced with an appropriate value. 
+; +; RUN: opt %s -S -nvvm-reflect -O2 -mtriple=nvptx64 \ +; RUN: | FileCheck %s --check-prefixes=COMMON,SM20 +; RUN: opt %s -S -nvvm-reflect -O2 -mtriple=nvptx64 -mcpu=sm_35 \ +; RUN: | FileCheck %s --check-prefixes=COMMON,SM35 + +@"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00" + +declare i32 @__nvvm_reflect(i8*) + +; COMMON-LABEL: @foo +define i32 @foo(float %a, float %b) { +; COMMON-NOT: call i32 @__nvvm_reflect + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*)) +; SM20: ret i32 200 +; SM35: ret i32 350 + ret i32 %reflect +} +