diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -86,7 +86,6 @@
 #define DEBUG_TYPE "argpromotion"
 
 STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
-STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
 STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
 
 namespace {
@@ -156,7 +155,6 @@
 static Function *doPromotion(
     Function *F,
     const DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> &ArgsToPromote,
-    SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
     Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
         ReplaceCallSite) {
   // Start by computing a new prototype for the function, which is the same as
@@ -174,15 +172,7 @@
   unsigned ArgNo = 0;
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
        ++I, ++ArgNo) {
-    if (ByValArgsToTransform.count(&*I)) {
-      // Simple byval argument? Just add all the struct element types.
-      Type *AgTy = I->getParamByValType();
-      StructType *STy = cast<StructType>(AgTy);
-      llvm::append_range(Params, STy->elements());
-      ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
-                        AttributeSet());
-      ++NumByValArgsPromoted;
-    } else if (!ArgsToPromote.count(&*I)) {
+    if (!ArgsToPromote.count(&*I)) {
       // Unchanged argument
       Params.push_back(I->getType());
       ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo));
@@ -251,28 +241,9 @@
     ArgNo = 0;
     for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
          ++I, ++AI, ++ArgNo)
-      if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+      if (!ArgsToPromote.count(&*I)) {
         Args.push_back(*AI); // Unmodified argument
         ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo));
-      } else if (ByValArgsToTransform.count(&*I)) {
-        // Emit a GEP and load for each element of the struct.
-        Type *AgTy = I->getParamByValType();
-        StructType *STy = cast<StructType>(AgTy);
-        Value *Idxs[2] = {
-            ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
-        const StructLayout *SL = DL.getStructLayout(STy);
-        Align StructAlign = *I->getParamAlign();
-        for (unsigned J = 0, Elems = STy->getNumElements(); J != Elems; ++J) {
-          Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), J);
-          auto *Idx =
-              IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(J));
-          // TODO: Tell AA about the new values?
-          Align Alignment =
-              commonAlignment(StructAlign, SL->getElementOffset(J));
-          Args.push_back(IRB.CreateAlignedLoad(
-              STy->getElementType(J), Idx, Alignment, Idx->getName() + ".val"));
-          ArgAttrVec.push_back(AttributeSet());
-        }
       } else if (!I->use_empty()) {
         Value *V = *AI;
         const auto &ArgParts = ArgsToPromote.find(&*I)->second;
@@ -346,7 +317,7 @@
   // the new arguments, also transferring over the names as well.
   Function::arg_iterator I2 = NF->arg_begin();
   for (Argument &Arg : F->args()) {
-    if (!ArgsToPromote.count(&Arg) && !ByValArgsToTransform.count(&Arg)) {
+    if (!ArgsToPromote.count(&Arg)) {
       // If this is an unmodified argument, move the name and users over to the
       // new version.
       Arg.replaceAllUsesWith(&*I2);
@@ -355,37 +326,6 @@
       continue;
     }
 
-    if (ByValArgsToTransform.count(&Arg)) {
-      // In the callee, we create an alloca, and store each of the new incoming
-      // arguments into the alloca.
-      Instruction *InsertPt = &NF->begin()->front();
-
-      // Just add all the struct element types.
-      Type *AgTy = Arg.getParamByValType();
-      Align StructAlign = *Arg.getParamAlign();
-      Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
-                                        StructAlign, "", InsertPt);
-      StructType *STy = cast<StructType>(AgTy);
-      Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
-                        nullptr};
-      const StructLayout *SL = DL.getStructLayout(STy);
-
-      for (unsigned J = 0, Elems = STy->getNumElements(); J != Elems; ++J) {
-        Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), J);
-        Value *Idx = GetElementPtrInst::Create(
-            AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(J),
-            InsertPt);
-        I2->setName(Arg.getName() + "." + Twine(J));
-        Align Alignment = commonAlignment(StructAlign, SL->getElementOffset(J));
-        new StoreInst(&*I2++, Idx, false, Alignment, InsertPt);
-      }
-
-      // Anything that used the arg should now use the alloca.
-      Arg.replaceAllUsesWith(TheAlloca);
-      TheAlloca->takeName(&Arg);
-      continue;
-    }
-
     // There potentially are metadata uses for things like llvm.dbg.value.
     // Replace them with undef, after handling the other regular uses.
     auto RauwUndefMetadata = make_scope_exit(
@@ -402,8 +342,8 @@
     }
 
     // Otherwise, if we promoted this argument, then all users are load
-    // instructions (with possible casts and GEPs in between).
-
+    // instructions (with possible casts and GEPs in between), or store
+    // instructions if the argument has the byval attribute.
     SmallVector<Value *, 16> Worklist;
     SmallVector<Instruction *, 16> DeadInsts;
     append_range(Worklist, Arg.users());
@@ -427,6 +367,16 @@
         continue;
       }
 
+      if (auto *SI = dyn_cast<StoreInst>(V)) {
+        // Stores are only allowed for byval arguments. It is the job of
+        // findArgParts to decide whether an argument with store users is
+        // eligible for promotion. An allowed store writes to a temporary
+        // (created explicitly, or implicitly by the byval attribute), so
+        // the store instruction can be removed too.
+        DeadInsts.push_back(SI);
+        continue;
+      }
+
       llvm_unreachable("Unexpected user");
     }
 
@@ -456,8 +406,8 @@
   // direct callees.
   return all_of(Callee->users(), [&](User *U) {
     CallBase &CB = cast<CallBase>(*U);
-    return isDereferenceableAndAlignedPointer(
-        CB.getArgOperand(Arg->getArgNo()), NeededAlign, Bytes, DL);
+    return isDereferenceableAndAlignedPointer(CB.getArgOperand(Arg->getArgNo()),
+                                              NeededAlign, Bytes, DL);
   });
 }
 
@@ -465,6 +415,7 @@
 /// parts it can be promoted into.
 static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
                          unsigned MaxElements, bool IsRecursive,
+                         bool IsStoresAllowed,
                          SmallVectorImpl<OffsetAndArgPart> &ArgPartsVec) {
   // Quick exit for unused arguments
   if (Arg->use_empty())
@@ -605,6 +556,10 @@
       continue;
     }
 
+    // Stores are allowed for byval arguments
+    if (IsStoresAllowed && isa<StoreInst>(V))
+      continue;
+
     // Unknown user.
     LLVM_DEBUG(dbgs() << "ArgPromotion of " << *Arg << " failed: "
                       << "unknown user " << *V << "\n");
@@ -724,11 +679,17 @@
   SmallVector<StoreInst *, 16> Stores;
 
   // Scan through the uses recursively to make sure the pointer is always used
-  // sanely.
+  // sanely. Note: we don't care whether the parts of the argument are actually
+  // loaded or stored. If there is an improper user (for example, a GEP with a
+  // non-constant index), we report that the padding can be accessed even if
+  // that user doesn't lead to a load or store instruction.
   SmallVector<Value *, 16> WorkList(Arg->users());
   while (!WorkList.empty()) {
     Value *V = WorkList.pop_back_val();
     if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(V);
+      if (GEP && !GEP->hasAllConstantIndices())
+        return true;
       if (PtrValues.insert(V).second)
         append_range(WorkList, V->users());
     } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
@@ -774,7 +735,7 @@
   // Don't perform argument promotion for naked functions; otherwise we can end
   // up removing parameters that are seemingly 'not used' as they are referred
   // to in the assembly.
-  if(F->hasFnAttribute(Attribute::Naked))
+  if (F->hasFnAttribute(Attribute::Naked))
     return nullptr;
 
   // Make sure that it is local to this module.
@@ -833,7 +794,6 @@
   // Check to see which arguments are promotable.  If an argument is promotable,
   // add it to ArgsToPromote.
   DenseMap<Argument *, SmallVector<OffsetAndArgPart, 4>> ArgsToPromote;
-  SmallPtrSet<Argument *, 8> ByValArgsToTransform;
   for (Argument *PtrArg : PointerArgs) {
     // Replace sret attribute with noalias. This reduces register pressure by
     // avoiding a register copy.
@@ -850,63 +810,34 @@
 
     // If we can promote the pointer to its value.
     SmallVector<OffsetAndArgPart, 4> ArgParts;
-    if (findArgParts(PtrArg, DL, AAR, MaxElements, IsRecursive, ArgParts)) {
+    // If this is a byval argument, we also allow store instructions among the
+    // argument's users, provided that the passed value is densely packed or
+    // we can prove the padding bytes are never accessed. Only arguments with
+    // a specified alignment are handled this way; if it's unspecified, the
+    // actual alignment of the argument is target-specific.
+    Type *ByValTy = PtrArg->getParamByValType();
+    bool IsStoresAllowed =
+        ByValTy && PtrArg->getParamAlign() &&
+        (ArgumentPromotionPass::isDenselyPacked(ByValTy, DL) ||
+         !canPaddingBeAccessed(PtrArg));
+
+    if (findArgParts(PtrArg, DL, AAR, MaxElements, IsRecursive, IsStoresAllowed,
+                     ArgParts)) {
       SmallVector<Type *, 4> Types;
       for (const auto &Pair : ArgParts)
         Types.push_back(Pair.second.Ty);
 
       if (areTypesABICompatible(Types, *F, TTI)) {
         ArgsToPromote.insert({PtrArg, std::move(ArgParts)});
-        continue;
-      }
-    }
-
-    // Otherwise, if this is a byval argument, and if the aggregate type is
-    // small, just pass the elements, which is always safe, if the passed value
-    // is densely packed or if we can prove the padding bytes are never
-    // accessed.
-    //
-    // Only handle arguments with specified alignment; if it's unspecified, the
-    // actual alignment of the argument is target-specific.
-    Type *ByValTy = PtrArg->getParamByValType();
-    bool IsSafeToPromote =
-        ByValTy && PtrArg->getParamAlign() &&
-        (ArgumentPromotionPass::isDenselyPacked(ByValTy, DL) ||
-         !canPaddingBeAccessed(PtrArg));
-    if (!IsSafeToPromote) {
-      LLVM_DEBUG(dbgs() << "ArgPromotion disables passing the elements of"
-                        << " the argument '" << PtrArg->getName()
-                        << "' because it is not safe.\n");
-      continue;
-    }
-    if (StructType *STy = dyn_cast<StructType>(ByValTy)) {
-      if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
-        LLVM_DEBUG(dbgs() << "ArgPromotion disables passing the elements of"
-                          << " the argument '" << PtrArg->getName()
-                          << "' because it would require adding more"
-                          << " than " << MaxElements
-                          << " arguments to the function.\n");
-        continue;
       }
-      SmallVector<Type *, 4> Types;
-      append_range(Types, STy->elements());
-
-      // If all the elements are single-value types, we can promote it.
-      bool AllSimple =
-          all_of(Types, [](Type *Ty) { return Ty->isSingleValueType(); });
-
-      // Safe to transform. Passing the elements as a scalar will allow sroa to
-      // hack on the new alloca we introduce.
-      if (AllSimple && areTypesABICompatible(Types, *F, TTI))
-        ByValArgsToTransform.insert(PtrArg);
     }
   }
 
   // No promotable pointer arguments.
-  if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+  if (ArgsToPromote.empty())
     return nullptr;
 
-  return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+  return doPromotion(F, ArgsToPromote, ReplaceCallSite);
 }
 
 PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
diff --git a/llvm/test/Transforms/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/ArgumentPromotion/attrs.ll
--- a/llvm/test/Transforms/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/attrs.ll
@@ -3,25 +3,14 @@
 
 %struct.ss = type { i32, i64 }
 
-; Don't drop 'byval' on %X here.
 define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b, i32* byval(i32) align 4 %X, i32 %i) nounwind {
 ; CHECK-LABEL: define {{[^@]+}}@f
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[B_0:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[X]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
-
   %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
   %temp1 = load i32, i32* %temp, align 4
   %temp2 = add i32 %temp1, 1
@@ -41,11 +30,9 @@
 ; CHECK-NEXT:    store i32 1, i32* [[TEMP1]], align 8
 ; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TEMP4]], align 4
-; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
+; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
 ; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
-; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
-; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]], i32* byval(i32) align 4 [[X]], i32 zeroext 0)
+; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i32 zeroext 0)
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
--- a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -2,24 +2,14 @@
 ; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; Arg promotion eliminates the struct argument.
-; FIXME: We should eliminate the i32* argument.
 
 %struct.ss = type { i32, i64 }
 
 define internal void @f(%struct.ss* byval(%struct.ss) align 8 %b, i32* byval(i32) align 4 %X) nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@f
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 8
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[X]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -41,11 +31,9 @@
 ; CHECK-NEXT:    store i32 1, i32* [[TEMP1]], align 8
 ; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TEMP4]], align 4
-; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 8
-; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
-; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]], i32* byval(i32) align 4 [[X]])
+; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
+; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/ArgumentPromotion/byval.ll b/llvm/test/Transforms/ArgumentPromotion/byval.ll
--- a/llvm/test/Transforms/ArgumentPromotion/byval.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/byval.ll
@@ -7,17 +7,9 @@
 
 define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b) nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@f
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -28,20 +20,11 @@
   ret void
 }
 
-
 define internal void @g(%struct.ss* byval(%struct.ss) align 32 %b) nounwind {
 ; CHECK-LABEL: define {{[^@]+}}@g
-; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i32 [[B_0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 32
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 32
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
-; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
-; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
-; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
+; CHECK-NEXT:    [[TEMP:%.*]] = add i32 [[B_0]], 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -75,6 +58,29 @@
   ret void
 }
 
+; Don't transform if an argument is written to and then loaded from;
+; the alias analysis 'canInstructionRangeModRef' check has to return
+; 'false' in that case.
+define internal void @k(%struct.ss* byval(%struct.ss) align 4 %b) nounwind  {
+; CHECK-LABEL: define {{[^@]+}}@k
+; CHECK-SAME: (%struct.ss* byval([[STRUCT_SS:%.*]]) align 4 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
+; CHECK-NEXT:    [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4
+; CHECK-NEXT:    [[TEMP2:%.*]] = add i32 [[TEMP1]], 1
+; CHECK-NEXT:    store i32 [[TEMP2]], i32* [[TEMP]], align 4
+; CHECK-NEXT:    [[TEMP3:%.*]] = load i32, i32* [[TEMP]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
+  %temp1 = load i32, i32* %temp, align 4
+  %temp2 = add i32 %temp1, 1
+  store i32 %temp2, i32* %temp, align 4
+  %temp3 = load i32, i32* %temp, align 4
+  ret void
+}
+
 define i32 @main() nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@main
 ; CHECK-SAME: () #[[ATTR0]] {
@@ -84,17 +90,14 @@
 ; CHECK-NEXT:    store i32 1, i32* [[TEMP1]], align 8
 ; CHECK-NEXT:    [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TEMP4]], align 4
-; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
+; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
 ; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
-; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
-; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]])
-; CHECK-NEXT:    [[S_01:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_01_VAL:%.*]] = load i32, i32* [[S_01]], align 32
-; CHECK-NEXT:    [[S_12:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_12_VAL:%.*]] = load i64, i64* [[S_12]], align 4
-; CHECK-NEXT:    call void @g(i32 [[S_01_VAL]], i64 [[S_12_VAL]])
+; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]])
+; CHECK-NEXT:    [[S_01:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i64 0, i32 0
+; CHECK-NEXT:    [[S_01_VAL:%.*]] = load i32, i32* [[S_01]], align 4
+; CHECK-NEXT:    call void @g(i32 [[S_01_VAL]])
 ; CHECK-NEXT:    call void @h(%struct.ss* byval([[STRUCT_SS]]) [[S]])
+; CHECK-NEXT:    call void @k(%struct.ss* byval([[STRUCT_SS]]) align 4 [[S]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -106,6 +109,7 @@
   call void @f(%struct.ss* byval(%struct.ss) align 4 %S) nounwind
   call void @g(%struct.ss* byval(%struct.ss) align 32 %S) nounwind
   call void @h(%struct.ss* byval(%struct.ss) %S) nounwind
+  call void @k(%struct.ss* byval(%struct.ss) align 4 %S) nounwind
   ret i32 0
 }
 
diff --git a/llvm/test/Transforms/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/ArgumentPromotion/dbg.ll
--- a/llvm/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/dbg.ll
@@ -19,15 +19,8 @@
 
 define internal void @test_byval(%struct.pair* byval(%struct.pair) align 4 %P) {
 ; CHECK-LABEL: define {{[^@]+}}@test_byval
-; CHECK-SAME: (i32 [[P_0:%.*]], i32 [[P_1:%.*]]) {
-; CHECK-NEXT:    [[P:%.*]] = alloca [[STRUCT_PAIR:%.*]], align 4
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[P_0]], i32* [[DOT0]], align 4
-; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 1
-; CHECK-NEXT:    store i32 [[P_1]], i32* [[DOT1]], align 4
+; CHECK-SAME: () {
 ; CHECK-NEXT:    [[SINK:%.*]] = alloca i32*, align 8
-; CHECK-NEXT:    [[DOT2:%.*]] = getelementptr [[STRUCT_PAIR]], [[STRUCT_PAIR]]* [[P]], i32 0, i32 0
-; CHECK-NEXT:    store i32* [[DOT2]], i32** [[SINK]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %1 = alloca i32*, align 8
@@ -42,11 +35,7 @@
 ; CHECK-NEXT:    [[Y_VAL:%.*]] = load i32*, i32** [[Y]], align 8, !dbg [[DBG4:![0-9]+]]
 ; CHECK-NEXT:    [[Y_VAL_VAL:%.*]] = load i32, i32* [[Y_VAL]], align 8, !dbg [[DBG4]]
 ; CHECK-NEXT:    call void @test(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]]
-; CHECK-NEXT:    [[P_0:%.*]] = getelementptr [[STRUCT_PAIR:%.*]], %struct.pair* [[P]], i32 0, i32 0, !dbg [[DBG5:![0-9]+]]
-; CHECK-NEXT:    [[P_0_VAL:%.*]] = load i32, i32* [[P_0]], align 4, !dbg [[DBG5]]
-; CHECK-NEXT:    [[P_1:%.*]] = getelementptr [[STRUCT_PAIR]], %struct.pair* [[P]], i32 0, i32 1, !dbg [[DBG5]]
-; CHECK-NEXT:    [[P_1_VAL:%.*]] = load i32, i32* [[P_1]], align 4, !dbg [[DBG5]]
-; CHECK-NEXT:    call void @test_byval(i32 [[P_0_VAL]], i32 [[P_1_VAL]]), !dbg [[DBG5]]
+; CHECK-NEXT:    call void @test_byval(), !dbg [[DBG5:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   call void @test(i32** %Y), !dbg !1
diff --git a/llvm/test/Transforms/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/ArgumentPromotion/fp80.ll
--- a/llvm/test/Transforms/ArgumentPromotion/fp80.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/fp80.ll
@@ -14,23 +14,23 @@
 
 define void @run() {
 ; CHECK-LABEL: define {{[^@]+}}@run() {
-; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %union.u* bitcast (%struct.s* @b to %union.u*) to i8*
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 10
 ; CHECK-NEXT:    [[DOTVAL:%.*]] = load i8, i8* [[TMP1]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i8 @UseLongDoubleUnsafely(i8 [[DOTVAL]])
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U:%.*]], %union.u* bitcast (%struct.s* @b to %union.u*), i32 0, i32 0
+; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U:%.*]], %union.u* bitcast (%struct.s* @b to %union.u*), i64 0, i32 0
 ; CHECK-NEXT:    [[DOT0_VAL:%.*]] = load x86_fp80, x86_fp80* [[DOT0]], align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call x86_fp80 @UseLongDoubleSafely(x86_fp80 [[DOT0_VAL]])
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.Foo* @a to i64*
-; CHECK-NEXT:    [[A_VAL:%.*]] = load i64, i64* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]])
-; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a)
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call x86_fp80 @UseLongDoubleSafelyNoPromotion(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.Foo* @a to i64*
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i64, i64* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @AccessPaddingOfStruct(i64 [[A_VAL]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @CaptureAStruct(%struct.Foo* byval([[STRUCT_FOO:%.*]]) @a)
 ; CHECK-NEXT:    ret void
 ;
-entry:
   tail call i8 @UseLongDoubleUnsafely(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
   tail call x86_fp80 @UseLongDoubleSafely(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
+  tail call x86_fp80 @UseLongDoubleSafelyNoPromotion(%union.u* byval(%union.u) align 16 bitcast (%struct.s* @b to %union.u*))
   call i64 @AccessPaddingOfStruct(%struct.Foo* byval(%struct.Foo) @a)
   call i64 @CaptureAStruct(%struct.Foo* byval(%struct.Foo) @a)
   ret void
@@ -38,11 +38,9 @@
 
 define internal i8 @UseLongDoubleUnsafely(%union.u* byval(%union.u) align 16 %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleUnsafely
-; CHECK-SAME: (i8 [[ARG_10_VAL:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret i8 [[ARG_10_VAL]]
+; CHECK-SAME: (i8 [[ARG_0_VAL:%.*]]) {
+; CHECK-NEXT:    ret i8 [[ARG_0_VAL]]
 ;
-entry:
   %bitcast = bitcast %union.u* %arg to %struct.s*
   %gep = getelementptr inbounds %struct.s, %struct.s* %bitcast, i64 0, i32 2
   %result = load i8, i8* %gep
@@ -51,23 +49,30 @@
 
 define internal x86_fp80 @UseLongDoubleSafely(%union.u* byval(%union.u) align 16 %arg) {
 ; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafely
-; CHECK-SAME: (x86_fp80 [[ARG_0:%.*]]) {
-; CHECK-NEXT:    [[ARG:%.*]] = alloca [[UNION_U:%.*]], align 16
-; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[UNION_U]], [[UNION_U]]* [[ARG]], i32 0, i32 0
-; CHECK-NEXT:    store x86_fp80 [[ARG_0]], x86_fp80* [[DOT0]], align 16
+; CHECK-SAME: (x86_fp80 [[ARG_0_VAL:%.*]]) {
+; CHECK-NEXT:    ret x86_fp80 [[ARG_0_VAL]]
+;
+  %gep = getelementptr inbounds %union.u, %union.u* %arg, i64 0, i32 0
+  %fp80 = load x86_fp80, x86_fp80* %gep
+  ret x86_fp80 %fp80
+}
+
+define internal x86_fp80 @UseLongDoubleSafelyNoPromotion(%union.u* byval(%union.u) align 16 %arg) {
+; CHECK-LABEL: define {{[^@]+}}@UseLongDoubleSafelyNoPromotion
+; CHECK-SAME: ([[UNION_U]]* byval([[UNION_U]]) align 16 [[ARG:%.*]]) {
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[UNION_U]], [[UNION_U]]* [[ARG]], i64 0, i32 0
-; CHECK-NEXT:    [[IDX_P:%.*]] = alloca i64, align 8
-; CHECK-NEXT:    store i64 0, i64* [[IDX_P]], align 8
-; CHECK-NEXT:    [[IDX:%.*]] = load i64, i64* [[IDX_P]], align 8
+; CHECK-NEXT:    [[TMP_IDX:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    store i64 0, i64* [[TMP_IDX]], align 8
+; CHECK-NEXT:    [[IDX:%.*]] = load i64, i64* [[TMP_IDX]], align 8
 ; CHECK-NEXT:    [[GEP_IDX:%.*]] = getelementptr inbounds [[UNION_U]], [[UNION_U]]* [[ARG]], i64 [[IDX]], i32 0
-; CHECK-NEXT:    [[FP80:%.*]] = load x86_fp80, x86_fp80* [[GEP]], align 16
+; CHECK-NEXT:    [[FP80:%.*]] = load x86_fp80, x86_fp80* [[GEP]]
 ; CHECK-NEXT:    ret x86_fp80 [[FP80]]
 ;
   %gep = getelementptr inbounds %union.u, %union.u* %arg, i64 0, i32 0
   %idx_slot = alloca i64, align 8
   store i64 0, i64* %idx_slot, align 8
   %idx = load i64, i64* %idx_slot, align 8
-  %gep_idx = getelementptr inbounds %union.u, %union.u* %arg, i64 %idx, i32 0 ; to protect from "usual" promotion
+  %gep_idx = getelementptr inbounds %union.u, %union.u* %arg, i64 %idx, i32 0 ; to protect from promotion
   %fp80 = load x86_fp80, x86_fp80* %gep
   ret x86_fp80 %fp80
 }