Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -5076,11 +5076,12 @@
     Address = N.getOperand(0);
     return true;
   }
-  if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
-    unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
-    if (IID == Intrinsic::nvvm_ptr_gen_to_param)
-      if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
-        return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
+  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
+  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
+    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
+        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
+        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
+      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
   }
   return false;
 }
Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2077,7 +2077,6 @@
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;

-  bool isKernel = llvm::isKernelFunction(*F);
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
@@ -2111,7 +2110,8 @@
             theArgs[i],
             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                      : nullptr))) {
-      assert(isKernel && "Only kernels can have image/sampler params");
+      assert(llvm::isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
       continue;
     }
@@ -2336,14 +2336,7 @@
     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
     if (p.getNode())
       p.getNode()->setIROrder(idx + 1);
-    if (isKernel)
-      InVals.push_back(p);
-    else {
-      SDValue p2 = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
-          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
-      InVals.push_back(p2);
-    }
+    InVals.push_back(p);
   }

   // Clang will check explicit VarArg and issue error if any. However, Clang
Index: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -7,20 +7,28 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Pointer arguments to kernel functions need to be lowered specially.
 //
-// 1. Copy byval struct args to local memory. This is a preparation for handling
-//    cases like
+// Arguments to kernel and device functions are passed via param space,
+// which imposes certain restrictions:
+// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
 //
-//    kernel void foo(struct A arg, ...)
-//    {
-//      struct A *p = &arg;
-//      ...
-//      ... = p->filed1 ...  (this is no generic address for .param)
-//      p->filed2 = ...      (this is no write access to .param)
-//    }
+// Kernel parameters are read-only and accessible only via ld.param
+// instruction, directly or via a pointer. Pointers to kernel
+// arguments can't be converted to generic address space.
 //
-// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+// Device function parameters are directly accessible via
+// ld.param/st.param, but taking the address of one returns a pointer
+// to a copy created in local space which *can't* be used with
+// ld.param/st.param.
+//
+// Copying a byval struct into local memory in IR allows us to enforce
+// the param space restrictions, gives the rest of IR a pointer w/o
+// param space restrictions, and gives us an opportunity to eliminate
+// the copy.
+//
+// Pointer arguments to kernel functions need more work to be lowered:
+//
+// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
 //    global address space. This allows later optimizations to emit
 //    ld.global.*/st.global.* for accessing these pointer arguments. For
 //    example,
@@ -47,7 +55,7 @@
 //        ...
 //      }
 //
-// 3. Convert pointers in a byval kernel parameter to pointers in the global
+// 2. Convert pointers in a byval kernel parameter to pointers in the global
 //    address space. As #2, it allows NVPTX to emit more ld/st.global. E.g.,
 //
 //    struct S {
@@ -101,6 +109,9 @@
 class NVPTXLowerKernelArgs : public FunctionPass {
   bool runOnFunction(Function &F) override;

+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
   // handle byval parameters
   void handleByValParam(Argument *Arg);
   // Knowing Ptr must point to the global address space, this function
@@ -192,11 +203,7 @@
 // =============================================================================
 // Main function for this pass.
 // =============================================================================
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
+bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
   if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
     // Mark pointers in byval structs as global.
     for (auto &B : F) {
@@ -228,6 +235,18 @@
   return true;
 }

+// Device functions only need to copy byval args into local memory.
+bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args())
+    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+      handleByValParam(&Arg);
+  return true;
+}
+
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
 FunctionPass *
 llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
   return new NVPTXLowerKernelArgs(TM);
Index: test/CodeGen/NVPTX/bug21465.ll
===================================================================
--- test/CodeGen/NVPTX/bug21465.ll
+++ test/CodeGen/NVPTX/bug21465.ll
@@ -15,7 +15,7 @@
   %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %0 = load i32, i32* %b, align 4
 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
   store i32 %0, i32* %output, align 4
 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
   ret void
Index: test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
===================================================================
--- test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -28,20 +28,38 @@

 %struct.S = type { i32*, i32* }

-define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval(
-; CHECK: cvta.to.global.u64
-; CHECK: cvta.to.global.u64
+define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %b = load i32*, i32** %b_ptr, align 4
   %v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32
+; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32
+; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+  ret void
+}
+
+; Regular functions lower byval arguments differently. We need to make
+; sure that we're loading byval argument data using [symbol+offset].
+; There's also no assumption that all pointers within are in global space.
+define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .func ptr_in_byval_func(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
+  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %b = load i32*, i32** %b_ptr, align 4
+  %v = load i32, i32* %b, align 4
+; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
+  store i32 %v, i32* %output, align 4
+; CHECK: st.u32 [%[[optr]]], %[[val]]
   ret void
 }

 !nvvm.annotations = !{!0, !1, !2}
 !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
 !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}
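
For context on the byval handling above, here is a rough IR-level sketch of what
NVPTXLowerKernelArgs::runOnDeviceFunction does to a byval argument via
handleByValParam: uses of the parameter are redirected to a freshly created
alloca, and the parameter's contents are copied into it through a param-space
(addrspace 101) pointer. The function and value names (@device_func,
%input.copy, %input.param, %input.val) and the alignment are illustrative only,
not taken from the patch or its tests.

Input IR (a device function taking %struct.S by value):

  %struct.S = type { i32*, i32* }

  define void @device_func(%struct.S* byval align 8 %input, i32* %output) {
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
    %b = load i32*, i32** %b_ptr, align 8
    %v = load i32, i32* %b, align 4
    store i32 %v, i32* %output, align 4
    ret void
  }

Approximate IR after the pass:

  define void @device_func(%struct.S* byval align 8 %input, i32* %output) {
    ; local copy of the byval struct; all former uses of %input now use it
    %input.copy = alloca %struct.S, align 8
    ; read the original argument through a param-space pointer
    %input.param = addrspacecast %struct.S* %input to %struct.S addrspace(101)*
    %input.val = load %struct.S, %struct.S addrspace(101)* %input.param
    store %struct.S %input.val, %struct.S* %input.copy, align 8
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input.copy, i64 0, i32 1
    %b = load i32*, i32** %b_ptr, align 8
    %v = load i32, i32* %b, align 4
    store i32 %v, i32* %output, align 4
    ret void
  }

Because the copy lives outside the param address space, the rest of the IR can
take its address and store to it freely, which is what the new header comment
describes; kernels additionally get their pointer arguments rewritten to the
global address space in runOnKernelFunction, as before.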