Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -5076,11 +5076,12 @@ Address = N.getOperand(0); return true; } - if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue(); - if (IID == Intrinsic::nvvm_ptr_gen_to_param) - if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam) - return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address)); + // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol + if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) { + if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC && + CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM && + CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam) + return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address); } return false; } Index: lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelLowering.cpp +++ lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2077,7 +2077,6 @@ SDValue Root = DAG.getRoot(); std::vector<SDValue> OutChains; - bool isKernel = llvm::isKernelFunction(*F); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) @@ -2111,7 +2110,8 @@ theArgs[i], (theArgs[i]->getParent() ? 
theArgs[i]->getParent()->getParent() : nullptr))) { - assert(isKernel && "Only kernels can have image/sampler params"); + assert(llvm::isKernelFunction(*F) && + "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); continue; } @@ -2336,14 +2336,7 @@ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) p.getNode()->setIROrder(idx + 1); - if (isKernel) - InVals.push_back(p); - else { - SDValue p2 = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, - DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p); - InVals.push_back(p2); - } + InVals.push_back(p); } // Clang will check explicit VarArg and issue error if any. However, Clang Index: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp =================================================================== --- lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -7,10 +7,9 @@ // //===----------------------------------------------------------------------===// // -// Pointer arguments to kernel functions need to be lowered specially. // -// 1. Copy byval struct args to local memory. This is a preparation for handling -// cases like +// 1. Copy byval struct args of kernel and device functions to local +// memory. This is a preparation for handling cases like // // kernel void foo(struct A arg, ...) // { @@ -20,6 +19,8 @@ // p->filed2 = ... (this is no write access to .param) // } // +// Pointer arguments to kernel functions need to be lowered specially. +// // 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the // global address space. This allows later optimizations to emit // ld.global.*/st.global.* for accessing these pointer arguments. 
For @@ -101,6 +102,9 @@ class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; + bool runOnKernelFunction(Function &F); + bool runOnDeviceFunction(Function &F); + // handle byval parameters void handleByValParam(Argument *Arg); // Knowing Ptr must point to the global address space, this function @@ -192,11 +196,7 @@ // ============================================================================= // Main function for this pass. // ============================================================================= -bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { - // Skip non-kernels. See the comments at the top of this file. - if (!isKernelFunction(F)) - return false; - +bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) { if (TM && TM->getDrvInterface() == NVPTX::CUDA) { // Mark pointers in byval structs as global. for (auto &B : F) { @@ -228,6 +228,18 @@ return true; } +// Device functions only need to copy byval args into local memory. +bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) { + for (Argument &Arg : F.args()) + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) + handleByValParam(&Arg); + return true; +} + +bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { + return isKernelFunction(F) ? 
runOnKernelFunction(F) : runOnDeviceFunction(F); +} + FunctionPass * llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) { return new NVPTXLowerKernelArgs(TM); Index: test/CodeGen/NVPTX/bug21465.ll =================================================================== --- test/CodeGen/NVPTX/bug21465.ll +++ test/CodeGen/NVPTX/bug21465.ll @@ -15,7 +15,7 @@ %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %0 = load i32, i32* %b, align 4 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] -; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4] +; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4] store i32 %0, i32* %output, align 4 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]] ret void Index: test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll =================================================================== --- test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll +++ test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll @@ -28,20 +28,38 @@ %struct.S = type { i32*, i32* } -define void @ptr_in_byval(%struct.S* byval %input, i32* %output) { -; CHECK-LABEL: .visible .entry ptr_in_byval( -; CHECK: cvta.to.global.u64 -; CHECK: cvta.to.global.u64 +define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) { +; CHECK-LABEL: .visible .entry ptr_in_byval_kernel( +; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1] +; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]]; +; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8] +; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]]; %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %b = load i32*, i32** %b_ptr, align 4 %v = load i32, i32* %b, align 4 -; CHECK: ld.global.u32 +; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]] store i32 %v, i32* %output, align 4 -; CHECK: st.global.u32 +; CHECK: st.global.u32 [%[[optr_g]]], %[[val]] + ret void +} + +; Regular functions lower byval arguments differently. 
We need to make +; sure that we're loading byval argument data using [symbol+offset]. +; There's also no assumption that all pointers within are in global space. +define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) { +; CHECK-LABEL: .visible .func ptr_in_byval_func( +; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1] +; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8] + %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 + %b = load i32*, i32** %b_ptr, align 4 + %v = load i32, i32* %b, align 4 +; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]] + store i32 %v, i32* %output, align 4 +; CHECK: st.u32 [%[[optr]]], %[[val]] ret void } !nvvm.annotations = !{!0, !1, !2} !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1} !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1} -!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1} +!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}