Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -5076,11 +5076,12 @@
     Address = N.getOperand(0);
     return true;
   }
-  if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
-    unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
-    if (IID == Intrinsic::nvvm_ptr_gen_to_param)
-      if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
-        return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
+  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
+  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
+    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
+        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
+        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
+      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
   }
   return false;
 }
Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2077,7 +2077,6 @@
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;

-  bool isKernel = llvm::isKernelFunction(*F);
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
@@ -2111,7 +2110,8 @@
             theArgs[i],
             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                      : nullptr))) {
-      assert(isKernel && "Only kernels can have image/sampler params");
+      assert(llvm::isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
       continue;
     }
@@ -2336,14 +2336,7 @@
     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
     if (p.getNode())
       p.getNode()->setIROrder(idx + 1);
-    if (isKernel)
-      InVals.push_back(p);
-    else {
-      SDValue p2 = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
-          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
-      InVals.push_back(p2);
-    }
+    InVals.push_back(p);
   }

   // Clang will check explicit VarArg and issue error if any. However, Clang
Index: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -7,20 +7,28 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Pointer arguments to kernel functions need to be lowered specially.
 //
-// 1. Copy byval struct args to local memory. This is a preparation for handling
-//    cases like
+// Arguments to kernel and device functions are passed via param space,
+// which imposes certain restrictions:
+// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
 //
-//    kernel void foo(struct A arg, ...)
-//    {
-//      struct A *p = &arg;
-//      ...
-//      ... = p->filed1 ...  (this is no generic address for .param)
-//      p->filed2 = ...      (this is no write access to .param)
-//    }
+// Kernel parameters are read-only and accessible only via ld.param
+// instruction, directly or via a pointer. Pointers to kernel
+// arguments can't be converted to generic address space.
 //
-// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+// Device function parameters are directly accessible via
+// ld.param/st.param, but taking the address of one returns a pointer
+// to a copy created in local space which *can't* be used with
+// ld.param/st.param.
+//
+// Copying a byval struct into local memory in IR allows us to enforce
+// the param space restrictions, gives the rest of IR a pointer w/o
+// param space restrictions, and gives us an opportunity to eliminate
+// the copy.
+//
+// Pointer arguments to kernel functions need more work to be lowered:
+//
+// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
 //    global address space. This allows later optimizations to emit
 //    ld.global.*/st.global.* for accessing these pointer arguments. For
 //    example,
@@ -47,7 +55,7 @@
 //        ...
 //      }
 //
-// 3. Convert pointers in a byval kernel parameter to pointers in the global
+// 2. Convert pointers in a byval kernel parameter to pointers in the global
 //    address space. As #2, it allows NVPTX to emit more ld/st.global. E.g.,
 //
 //    struct S {
@@ -101,6 +109,9 @@
 class NVPTXLowerKernelArgs : public FunctionPass {
   bool runOnFunction(Function &F) override;

+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
   // handle byval parameters
   void handleByValParam(Argument *Arg);
   // Knowing Ptr must point to the global address space, this function
@@ -192,11 +203,7 @@
 // =============================================================================
 // Main function for this pass.
 // =============================================================================
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
+bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
   if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
     // Mark pointers in byval structs as global.
     for (auto &B : F) {
@@ -228,6 +235,18 @@
   return true;
 }

+// Device functions only need to copy byval args into local memory.
+bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args())
+    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+      handleByValParam(&Arg);
+  return true;
+}
+
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
 FunctionPass *
 llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
   return new NVPTXLowerKernelArgs(TM);
Index: test/CodeGen/NVPTX/bug21465.ll
===================================================================
--- test/CodeGen/NVPTX/bug21465.ll
+++ test/CodeGen/NVPTX/bug21465.ll
@@ -15,7 +15,7 @@
   %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %0 = load i32, i32* %b, align 4
 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
   store i32 %0, i32* %output, align 4
 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
   ret void
Index: test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
===================================================================
--- test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -28,20 +28,38 @@

 %struct.S = type { i32*, i32* }

-define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval(
-; CHECK: cvta.to.global.u64
-; CHECK: cvta.to.global.u64
+define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %b = load i32*, i32** %b_ptr, align 4
   %v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32
+; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32
+; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+  ret void
+}
+
+; Regular functions lower byval arguments differently. We need to make
+; sure that we're loading byval argument data using [symbol+offset].
+; There's also no assumption that all pointers within are in global space.
+define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .func ptr_in_byval_func(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
+  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %b = load i32*, i32** %b_ptr, align 4
+  %v = load i32, i32* %b, align 4
+; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
+  store i32 %v, i32* %output, align 4
+; CHECK: st.u32 [%[[optr]]], %[[val]]
   ret void
 }

 !nvvm.annotations = !{!0, !1, !2}
 !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
 !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}
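
For context on the byval handling above, here is a rough IR-level sketch of what
NVPTXLowerKernelArgs::runOnDeviceFunction does to a byval argument via
handleByValParam: uses of the parameter are redirected to a freshly created
alloca, and the parameter's contents are copied into it through a param-space
(addrspace 101) pointer. The function and value names (@device_func,
%input.copy, %input.param, %input.val) and the alignment are illustrative only,
not taken from the patch or its tests.

Input IR (a device function taking %struct.S by value):

  %struct.S = type { i32*, i32* }

  define void @device_func(%struct.S* byval align 8 %input, i32* %output) {
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
    %b = load i32*, i32** %b_ptr, align 8
    %v = load i32, i32* %b, align 4
    store i32 %v, i32* %output, align 4
    ret void
  }

Approximate IR after the pass:

  define void @device_func(%struct.S* byval align 8 %input, i32* %output) {
    ; local copy of the byval struct; all former uses of %input now use it
    %input.copy = alloca %struct.S, align 8
    ; read the original argument through a param-space pointer
    %input.param = addrspacecast %struct.S* %input to %struct.S addrspace(101)*
    %input.val = load %struct.S, %struct.S addrspace(101)* %input.param
    store %struct.S %input.val, %struct.S* %input.copy, align 8
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input.copy, i64 0, i32 1
    %b = load i32*, i32** %b_ptr, align 8
    %v = load i32, i32* %b, align 4
    store i32 %v, i32* %output, align 4
    ret void
  }

Because the copy lives outside the param address space, the rest of the IR can
take its address and store to it freely, which is what the new header comment
describes; kernels additionally get their pointer arguments rewritten to the
global address space in runOnKernelFunction, as before.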