diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -140,6 +140,7 @@
 
 // =============================================================================
 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// and we can't guarantee that the only accesses are loads,
 // then add the following instructions to the first basic block:
 //
 // %temp = alloca %struct.x, align 8
@@ -150,7 +151,47 @@
 // The above code allocates some space in the stack and copies the incoming
 // struct from param space to local space.
 // Then replace all occurrences of %d by %temp.
+//
+// In case we know that all users are GEPs or Loads, replace them with the same
+// ones in parameter AS, so we can access them using ld.param.
 // =============================================================================
+
+// Replaces the \p OldUser instruction with the same one in parameter AS.
+// Only Load and GEP are supported.
+static void ConvertToParamAS(Value *OldUser, Value *Param) {
+  Instruction *I = dyn_cast<Instruction>(OldUser);
+  assert(I && "OldUser must be an instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I)) {
+    auto *NewLI = new LoadInst(LI->getType(), Param, LI->getName(),
+                               LI->isVolatile(), LI->getAlign(), I);
+    LI->replaceAllUsesWith(NewLI);
+    LI->eraseFromParent();
+    return;
+  }
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+    SmallVector<Value *, 4> Indices(GEP->indices());
+    auto *NewGEP =
+        GetElementPtrInst::Create(nullptr, Param, Indices, GEP->getName(), GEP);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+
+    SmallVector<Value *, 16> UsersToUpdate(GEP->users());
+    llvm::for_each(UsersToUpdate,
+                   [NewGEP](Value *V) { ConvertToParamAS(V, NewGEP); });
+    GEP->eraseFromParent();
+    return;
+  }
+  llvm_unreachable("Only Load and GEP can be converted to param AS.");
+}
+
+static bool isALoadChain(Value *V) {
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    if (isa<GetElementPtrInst>(I))
+      return llvm::all_of(I->users(), isALoadChain);
+    return isa<LoadInst>(I);
+  }
+  return false;
+}
+
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -159,6 +200,21 @@
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
+
+  if (llvm::all_of(Arg->users(), isALoadChain)) {
+    // Replace all loads with equivalent loads in param AS. This allows loading
+    // the Arg directly from parameter AS, without making a temporary copy.
+    SmallVector<Value *, 16> UsersToUpdate(Arg->users());
+    Value *ArgInParamAS = new AddrSpaceCastInst(
+        Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+        FirstInst);
+    llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
+      ConvertToParamAS(V, ArgInParamAS);
+    });
+    return;
+  }
+
+  // Otherwise we have to create a temporary copy.
   const DataLayout &DL = Func->getParent()->getDataLayout();
   unsigned AS = DL.getAllocaAddrSpace();
   AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
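NOTE: the new fast path only fires when every use of the byval argument is a chain of GEPs ending in loads (isALoadChain). A rough sketch of the intended IR rewrite, reusing the %struct.x/%d names from the comment above; this is an illustration only (the %d.param/%p.param names are made up, the actual names come from the original instructions), not output produced verbatim by the patch:

    ; before NVPTXLowerArgs, inside a kernel foo(%struct.x* byval %d)
    %p = getelementptr inbounds %struct.x, %struct.x* %d, i64 0, i32 0
    %v = load i32, i32* %p, align 4

    ; after NVPTXLowerArgs, when all users of %d are GEPs/loads
    %d.param = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
    %p.param = getelementptr inbounds %struct.x, %struct.x addrspace(101)* %d.param, i64 0, i32 0
    %v = load i32, i32 addrspace(101)* %p.param, align 4

Loads from addrspace(101) are then emitted as ld.param, which is what the new test below checks for.
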
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ham = type { [4 x i32] }
+
+; // Verify that load with static offset into parameter is done directly.
+; CHECK-LABEL: .visible .entry static_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp eq i32 %arg2, 3
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
+  %tmp5 = load i32, i32* %tmp4, align 4
+  store i32 %tmp5, i32* %arg, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb3, %bb
+  ret void
+}
+
+; // Verify that load with dynamic offset into parameter is also done directly.
+; CHECK-LABEL: .visible .entry dynamic_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  ret void
+}
+
+; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
+; CHECK-LABEL: .visible .entry pointer_escapes
+; CHECK: .local .align 8 .b8 __local_depot{{.*}}
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
+; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: convergent norecurse nounwind mustprogress
+define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  %tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
+!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
+!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
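
NOTE: in @pointer_escapes the GEP result is passed to @escape, so isALoadChain rejects the argument and handleByValParam falls back to the pre-existing copy path described in the pass comment. Roughly, the entry block then gets (value names made up for illustration; the real ones come from Arg->getName()):

    %arg1.copy = alloca %struct.ham, align 4
    %arg1.param = addrspacecast %struct.ham* %arg1 to %struct.ham addrspace(101)*
    %tv = load %struct.ham, %struct.ham addrspace(101)* %arg1.param, align 4
    store %struct.ham %tv, %struct.ham* %arg1.copy, align 4

with the remaining uses of %arg1 rewritten to the alloca; the __local_depot plus the st.local/ld.local checks in the test correspond to that temporary copy.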