diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -140,6 +140,7 @@
 // =============================================================================
 // If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// and we can't guarantee that the only accesses are loads,
 // then add the following instructions to the first basic block:
 //
 // %temp = alloca %struct.x, align 8
@@ -150,7 +151,57 @@
 // The above code allocates some space in the stack and copies the incoming
 // struct from param space to local space.
 // Then replace all occurrences of %d by %temp.
+//
+// In case we know that all users are GEPs or Loads, replace them with the same
+// ones in parameter AS, so we can access them using ld.param.
 // =============================================================================
+
+// Replaces the \p OldUser instruction with the same in parameter AS.
+// Only Load and GEP are supported.
+static void convertToParamAS(Value *OldUser, Value *Param) {
+  Instruction *I = dyn_cast<Instruction>(OldUser);
+  assert(I && "OldUser must be an instruction");
+  struct IP {
+    Instruction *OldInstruction;
+    Value *NewParam;
+  };
+  SmallVector<IP> ItemsToConvert = {{I, Param}};
+  SmallVector<GetElementPtrInst *> GEPsToDelete;
+  while (!ItemsToConvert.empty()) {
+    IP I = ItemsToConvert.pop_back_val();
+    if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction))
+      LI->setOperand(0, I.NewParam);
+    else if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
+      SmallVector<Value *> Indices(GEP->indices());
+      auto *NewGEP = GetElementPtrInst::Create(nullptr, I.NewParam, Indices,
+                                               GEP->getName(), GEP);
+      NewGEP->setIsInBounds(GEP->isInBounds());
+      llvm::for_each(GEP->users(), [NewGEP, &ItemsToConvert](Value *V) {
+        ItemsToConvert.push_back({cast<Instruction>(V), NewGEP});
+      });
+      GEPsToDelete.push_back(GEP);
+    } else
+      llvm_unreachable("Only Load and GEP can be converted to param AS.");
+  }
+  llvm::for_each(GEPsToDelete,
+                 [](GetElementPtrInst *GEP) { GEP->eraseFromParent(); });
+}
+
+static bool isALoadChain(Value *Start) {
+  SmallVector<Value *> ValuesToCheck = {Start};
+  while (!ValuesToCheck.empty()) {
+    Value *V = ValuesToCheck.pop_back_val();
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return false;
+    if (isa<GetElementPtrInst>(I))
+      ValuesToCheck.append(I->user_begin(), I->user_end());
+    else if (!isa<LoadInst>(I))
+      return false;
+  }
+  return true;
+}
+
 void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   Function *Func = Arg->getParent();
   Instruction *FirstInst = &(Func->getEntryBlock().front());
@@ -159,6 +210,21 @@
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
+
+  if (llvm::all_of(Arg->users(), isALoadChain)) {
+    // Replace all loads with the loads in param AS. This allows loading the
+    // Arg directly from parameter AS, without making a temporary copy.
+    SmallVector<Value *> UsersToUpdate(Arg->users());
+    Value *ArgInParamAS = new AddrSpaceCastInst(
+        Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+        FirstInst);
+    llvm::for_each(UsersToUpdate, [ArgInParamAS](Value *V) {
+      convertToParamAS(V, ArgInParamAS);
+    });
+    return;
+  }
+
+  // Otherwise we have to create a temporary copy.
   const DataLayout &DL = Func->getParent()->getDataLayout();
   unsigned AS = DL.getAllocaAddrSpace();
   AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -23,5 +23,12 @@
   %arg.idx.val.val = load i32, i32* %arg.idx.val, align 4
   %add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
   store i32 %add.i, i32* %arg.idx1.val, align 4
+
+  ; Let the pointer escape so we still create a local copy, which this test
+  ; uses to check the load alignment.
+  %tmp = call i32* @escape(i32* nonnull %arg.idx2)
   ret void
 }
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ham = type { [4 x i32] }
+
+; Verify that a load with a static offset into the parameter is done directly.
+; CHECK-LABEL: .visible .entry static_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp eq i32 %arg2, 3
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
+  %tmp5 = load i32, i32* %tmp4, align 4
+  store i32 %tmp5, i32* %arg, align 4
+  br label %bb6
+
+bb6:                                              ; preds = %bb3, %bb
+  ret void
+}
+
+; Verify that a load with a dynamic offset into the parameter is also done directly.
+; CHECK-LABEL: .visible .entry dynamic_offset
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
+; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: nofree norecurse nounwind willreturn mustprogress
+define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  ret void
+}
+
+; Verify that if the pointer escapes, we do fall back to using a temporary copy.
+; CHECK-LABEL: .visible .entry pointer_escapes
+; CHECK: .local .align 8 .b8 __local_depot{{.*}}
+; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
+; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
+; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
+; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
+; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
+; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
+; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
+; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+
+; Function Attrs: convergent norecurse nounwind mustprogress
+define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+bb:
+  %tmp = sext i32 %arg2 to i64
+  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
+  %tmp4 = load i32, i32* %tmp3, align 4
+  store i32 %tmp4, i32* %arg, align 4
+  %tmp5 = call i32* @escape(i32* nonnull %tmp3) #3
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare dso_local i32* @escape(i32*) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0, !1, !2}
+!nvvm.annotations = !{!3, !4, !5}
+
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
+!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
+!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -35,7 +35,7 @@
 ; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
 ; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
+  %b = load i32*, i32** %b_ptr, align 8
   %v = load i32, i32* %b, align 4
 ; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, i32* %output, align 4
@@ -51,7 +51,7 @@
 ; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
 ; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
+  %b = load i32*, i32** %b_ptr, align 8
   %v = load i32, i32* %b, align 4
 ; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
   store i32 %v, i32* %output, align 4
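
For context, here is a rough before/after sketch of the fast path this patch adds. The kernel name and value names below are made up for illustration; they are not part of the patch or its tests, and the !nvvm.annotations kernel metadata is omitted. Given a kernel whose byval argument is only accessed through GEPs and loads, e.g.

  %struct.ham = type { [4 x i32] }

  define void @example(%struct.ham* byval(%struct.ham) align 4 %in, i32* %out) {
    %p = getelementptr inbounds %struct.ham, %struct.ham* %in, i64 0, i32 0, i64 3
    %v = load i32, i32* %p, align 4
    store i32 %v, i32* %out, align 4
    ret void
  }

NVPTXLowerArgs no longer emits the alloca/copy sequence; it casts the argument into the param address space (101) and rewrites the GEP/load chain in place, roughly:

  %in.param = addrspacecast %struct.ham* %in to %struct.ham addrspace(101)*
  %p1 = getelementptr inbounds %struct.ham, %struct.ham addrspace(101)* %in.param, i64 0, i32 0, i64 3
  %v = load i32, i32 addrspace(101)* %p1, align 4

which the backend can then select as a direct ld.param.u32, rather than a local copy followed by ld.local.u32 as in the pointer_escapes case above.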