diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -44,7 +44,7 @@
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
-FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
+FunctionPass *createNVPTXLowerArgsPass();
 FunctionPass *createNVPTXLowerAllocaPass();
 MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -93,10 +93,12 @@
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include <numeric>
 #include <queue>
@@ -113,11 +115,11 @@
 class NVPTXLowerArgs : public FunctionPass {
   bool runOnFunction(Function &F) override;
 
-  bool runOnKernelFunction(Function &F);
-  bool runOnDeviceFunction(Function &F);
+  bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F);
+  bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F);
 
   // handle byval parameters
-  void handleByValParam(Argument *Arg);
+  void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg);
   // Knowing Ptr must point to the global address space, this function
   // addrspacecasts Ptr to global and then back to generic. This allows
   // NVPTXInferAddressSpaces to fold the global-to-generic cast into
@@ -126,21 +128,23 @@
 
 public:
   static char ID; // Pass identification, replacement for typeid
-  NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM) {}
+  NVPTXLowerArgs() : FunctionPass(ID) {}
   StringRef getPassName() const override {
     return "Lower pointer arguments of CUDA kernels";
   }
-
-private:
-  const NVPTXTargetMachine *TM;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+  }
 };
 } // namespace
 
 char NVPTXLowerArgs::ID = 1;
 
-INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
-                "Lower arguments (NVPTX)", false, false)
+INITIALIZE_PASS_BEGIN(NVPTXLowerArgs, "nvptx-lower-args",
+                      "Lower arguments (NVPTX)", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
+                    "Lower arguments (NVPTX)", false, false)
 
 // =============================================================================
 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
@@ -310,7 +314,8 @@
   }
 }
 
-void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
+void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
+                                      Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
   Type *StructType = Arg->getParamByValType();
@@ -354,12 +359,8 @@
     convertToParamAS(V, ArgInParamAS);
     LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
 
-    // Further optimizations require target lowering info.
-    if (!TM)
-      return;
-
     const auto *TLI =
-        cast<NVPTXTargetLowering>(TM->getSubtargetImpl()->getTargetLowering());
+        cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
 
     adjustByValArgAlignment(Arg, ArgInParamAS, TLI);
 
@@ -390,7 +391,7 @@
 }
 
 void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
-  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
+  if (Ptr->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)
     return;
 
   // Deciding where to emit the addrspacecast pair.
@@ -420,8 +421,9 @@
 // =============================================================================
 // Main function for this pass.
 // =============================================================================
-bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
-  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
+bool NVPTXLowerArgs::runOnKernelFunction(const NVPTXTargetMachine &TM,
+                                         Function &F) {
+  if (TM.getDrvInterface() == NVPTX::CUDA) {
     // Mark pointers in byval structs as global.
     for (auto &B : F) {
       for (auto &I : B) {
@@ -444,8 +446,8 @@
   for (Argument &Arg : F.args()) {
     if (Arg.getType()->isPointerTy()) {
       if (Arg.hasByValAttr())
-        handleByValParam(&Arg);
-      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
+        handleByValParam(TM, &Arg);
+      else if (TM.getDrvInterface() == NVPTX::CUDA)
         markPointerAsGlobal(&Arg);
     }
   }
@@ -453,19 +455,20 @@
 }
 
 // Device functions only need to copy byval args into local memory.
-bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
+bool NVPTXLowerArgs::runOnDeviceFunction(const NVPTXTargetMachine &TM,
+                                         Function &F) {
   LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
   for (Argument &Arg : F.args())
     if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
-      handleByValParam(&Arg);
+      handleByValParam(TM, &Arg);
   return true;
 }
 
 bool NVPTXLowerArgs::runOnFunction(Function &F) {
-  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
-}
+  auto &TM = getAnalysis<TargetPassConfig>().getTM<NVPTXTargetMachine>();
 
-FunctionPass *
-llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) {
-  return new NVPTXLowerArgs(TM);
+  return isKernelFunction(F) ? runOnKernelFunction(TM, F)
+                             : runOnDeviceFunction(TM, F);
 }
+
+FunctionPass *llvm::createNVPTXLowerArgsPass() { return new NVPTXLowerArgs(); }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -326,7 +326,7 @@
 
   // NVPTXLowerArgs is required for correctness and should be run right
   // before the address space inference passes.
-  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
+  addPass(createNVPTXLowerArgsPass());
   if (getOptLevel() != CodeGenOpt::None) {
     addAddressSpaceInferencePasses();
     addStraightLineScalarOptimizationPasses();
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -1,5 +1,7 @@
-; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefix IR
-; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,IR,IRC
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,IR,IRO
+; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,PTX,PTXC
+; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,PTX,PTXO
 ; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
 
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
@@ -9,6 +11,7 @@
 %class.inner = type { ptr, ptr }
 
 ; Check that nvptx-lower-args preserves arg alignment
+; COMMON-LABEL: load_alignment
 define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
 entry:
 ; IR: load %class.outer, ptr addrspace(101)
@@ -30,5 +33,43 @@
   ret void
 }
 
+
+; COMMON-LABEL: ptr_generic
+define void @ptr_generic(ptr %out, ptr %in) {
+; IRC: %in3 = addrspacecast ptr %in to ptr addrspace(1)
+; IRC: %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
+; IRC: %out1 = addrspacecast ptr %out to ptr addrspace(1)
+; IRC: %out2 = addrspacecast ptr addrspace(1) %out1 to ptr
+; PTXC: cvta.to.global.u64
+; PTXC: cvta.to.global.u64
+; PTXC: ld.global.u32
+; PTXC: st.global.u32
+
+; OpenCL cannot make assumptions about the incoming pointer, so we should
+; generate generic loads/stores.
+; IRO-NOT: addrspacecast
+; PTXO-NOT: cvta.to.global
+; PTXO: ld.u32
+; PTXO: st.u32
+  %v = load i32, ptr %in, align 4
+  store i32 %v, ptr %out, align 4
+  ret void
+}
+
+; COMMON-LABEL: ptr_nongeneric
+define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+; IR-NOT: addrspacecast
+; PTX-NOT: cvta.to.global
+; PTX: ld.const.u32
+; PTX: st.global.u32
+  %v = load i32, ptr addrspace(4) %in, align 4
+  store i32 %v, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+
 ; Function Attrs: convergent nounwind
 declare dso_local ptr @escape(ptr) local_unnamed_addr
+!nvvm.annotations = !{!0, !1}
+!0 = !{ptr @ptr_generic, !"kernel", i32 1}
+!1 = !{ptr @ptr_nongeneric, !"kernel", i32 1}