Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -662,6 +662,61 @@ TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes; break; case ADDRESS_SPACE_PARAM: + if (Src.getOpcode() == NVPTXISD::MoveParam) { + // TL;DR: addrspacecast MoveParam to param space is a no-op. + // + // Longer version is that this particular change is both an + // optimization and a work-around for a problem in ptxas for + // sm_50+. + // + // Background: + // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal + // parameter has its address taken within the called function + // [..] that the parameter will be copied to the stack if + // necessary, and so the address will be in the .local state + // space and is accessed via ld.local and st.local + // instructions. + // + // Bug part: 'copied to the stack if necessary' part is + // currently broken for byval arguments with alignment < 4 for + // sm_50+ in all public versions of CUDA. Taking address of + // such argument results in SASS code that attempts to store + // the value using 32-bit write to an unaligned address. + // + // Optimization: On top of the overhead of spill-to-stack we + // also convert that address from local to generic space which + // may result in further overhead. All of it is in most cases + // unnecessary as we only need the value of the variable and + // it can be accessed directly by using argument symbol. We'll + // use address of local copy if it's needed. + // + // In order for this bit to do its job we need to: + + // a) Let LLVM do the spilling and make sure IR does not + // access byval arguments directly. Instead argument pointer + // (which is in default address space) is addrspacecast'ed to + // param space and the data it points to is copied to + // allocated local space (i.e. we let compiler spill it which + // gives us control over when it happens and an opportunity to + // optimize the spill away later if nobody needs its + // address). Then all uses of arg pointer are replaced with a + // pointer to local copy of the arg. All this is done in + // NVPTXLowerKernelArgs. + + // b) Now that function's IR has access to local copy of the + // byval arg, we no longer need to convert pointer to a + // generic address space. That would normally be done in + // LowerFormalArguments(). Now it just lowers argument to + // NVPTXISD::MoveParam without any space conversion. + + // c) And the final step is done by the code below. It + // replaces addrspacecast from step (a) which takes MoveParam + // from step(b) with arg symbol itself which can then be used + // for [symbol+offset] addressing. + + ReplaceNode(N, Src.getOperand(0).getNode()); + return; + } Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 : NVPTX::nvvm_ptr_gen_to_param; break; Index: lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelLowering.cpp +++ lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2078,7 +2078,6 @@ SDValue Root = DAG.getRoot(); std::vector OutChains; - bool isKernel = llvm::isKernelFunction(*F); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) @@ -2112,7 +2111,8 @@ theArgs[i], (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() : nullptr))) { - assert(isKernel && "Only kernels can have image/sampler params"); + assert(llvm::isKernelFunction(*F) && + "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); continue; } @@ -2334,6 +2334,11 @@ // machine instruction fails because TargetExternalSymbol // (not lowered) is target dependent, and CopyToReg assumes // the source is lowered. + // + // Byval arguments for regular function are lowered the same way + // as for kernels in order to generate better code and work around + // a known issue in ptxas. See comments in + // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details. EVT ObjectVT = getValueType(DL, Ty); assert(ObjectVT == Ins[InsIdx].VT && "Ins type did not match function type"); @@ -2341,14 +2346,7 @@ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) p.getNode()->setIROrder(idx + 1); - if (isKernel) - InVals.push_back(p); - else { - SDValue p2 = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, - DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p); - InVals.push_back(p2); - } + InVals.push_back(p); } // Clang will check explicit VarArg and issue error if any. However, Clang Index: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp =================================================================== --- lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -101,6 +101,11 @@ class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; + // Kernels and regular device functions treat byval arguments + // differently. + bool runOnKernelFunction(Function &F); + bool runOnDeviceFunction(Function &F); + // handle byval parameters void handleByValParam(Argument *Arg); // Knowing Ptr must point to the global address space, this function @@ -192,11 +197,7 @@ // ============================================================================= // Main function for this pass. // ============================================================================= -bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { - // Skip non-kernels. See the comments at the top of this file. - if (!isKernelFunction(F)) - return false; - +bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) { if (TM && TM->getDrvInterface() == NVPTX::CUDA) { // Mark pointers in byval structs as global. for (auto &B : F) { @@ -228,6 +229,19 @@ return true; } +// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the +// details on why we need to spill byval arguments into local memory. +bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) { + for (Argument &Arg : F.args()) + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) + handleByValParam(&Arg); + return true; +} + +bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { + return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F); +} + FunctionPass * llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) { return new NVPTXLowerKernelArgs(TM); Index: test/CodeGen/NVPTX/bug21465.ll =================================================================== --- test/CodeGen/NVPTX/bug21465.ll +++ test/CodeGen/NVPTX/bug21465.ll @@ -15,7 +15,7 @@ %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %0 = load i32, i32* %b, align 4 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] -; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4] +; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4] store i32 %0, i32* %output, align 4 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]] ret void Index: test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll =================================================================== --- test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll +++ test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll @@ -28,20 +28,38 @@ %struct.S = type { i32*, i32* } -define void @ptr_in_byval(%struct.S* byval %input, i32* %output) { -; CHECK-LABEL: .visible .entry ptr_in_byval( -; CHECK: cvta.to.global.u64 -; CHECK: cvta.to.global.u64 +define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) { +; CHECK-LABEL: .visible .entry ptr_in_byval_kernel( +; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1] +; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]]; +; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8] +; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]]; %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %b = load i32*, i32** %b_ptr, align 4 %v = load i32, i32* %b, align 4 -; CHECK: ld.global.u32 +; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]] store i32 %v, i32* %output, align 4 -; CHECK: st.global.u32 +; CHECK: st.global.u32 [%[[optr_g]]], %[[val]] + ret void +} + +; Regular functions lower byval arguments differently. We need to make +; sure that we're loading byval argument data using [symbol+offset]. +; There's also no assumption that all pointers within are in global space. +define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) { +; CHECK-LABEL: .visible .func ptr_in_byval_func( +; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1] +; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8] + %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 + %b = load i32*, i32** %b_ptr, align 4 + %v = load i32, i32* %b, align 4 +; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]] + store i32 %v, i32* %output, align 4 +; CHECK: st.u32 [%[[optr]]], %[[val]] ret void } !nvvm.annotations = !{!0, !1, !2} !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1} !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1} -!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1} +!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}