Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -662,6 +662,11 @@ TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes; break; case ADDRESS_SPACE_PARAM: + if (Src.getOpcode() == NVPTXISD::MoveParam) { + // addrspacecast MoveParam to param space is a no-op. + ReplaceNode(N, Src.getOperand(0).getNode()); + return; + } Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 : NVPTX::nvvm_ptr_gen_to_param; break; Index: lib/Target/NVPTX/NVPTXISelLowering.cpp =================================================================== --- lib/Target/NVPTX/NVPTXISelLowering.cpp +++ lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2341,14 +2341,7 @@ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) p.getNode()->setIROrder(idx + 1); - if (isKernel) - InVals.push_back(p); - else { - SDValue p2 = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, - DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p); - InVals.push_back(p2); - } + InVals.push_back(p); } // Clang will check explicit VarArg and issue error if any. However, Clang Index: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp =================================================================== --- lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -101,6 +101,11 @@ class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; + // Kernels and regular device functions treat byval arguments + // differently. + bool runOnKernelFunction(Function &F); + bool runOnDeviceFunction(Function &F); + // handle byval parameters void handleByValParam(Argument *Arg); // Knowing Ptr must point to the global address space, this function @@ -192,11 +197,7 @@ // ============================================================================= // Main function for this pass. // ============================================================================= -bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { - // Skip non-kernels. See the comments at the top of this file. - if (!isKernelFunction(F)) - return false; - +bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) { if (TM && TM->getDrvInterface() == NVPTX::CUDA) { // Mark pointers in byval structs as global. for (auto &B : F) { @@ -228,6 +229,17 @@ return true; } +bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) { + for (Argument &Arg : F.args()) + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) + handleByValParam(&Arg); + return true; +} + +bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { + return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F); +} + FunctionPass * llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) { return new NVPTXLowerKernelArgs(TM); Index: test/CodeGen/NVPTX/bug21465.ll =================================================================== --- test/CodeGen/NVPTX/bug21465.ll +++ test/CodeGen/NVPTX/bug21465.ll @@ -15,7 +15,7 @@ %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %0 = load i32, i32* %b, align 4 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] -; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4] +; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4] store i32 %0, i32* %output, align 4 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]] ret void