Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -41,8 +41,8 @@
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
   int resolveFrameIndexReference(const MachineFunction &MF, int FI,
-                                 unsigned &FrameReg,
-                                 bool PreferFP = false) const;
+                                 unsigned &FrameReg, bool PreferFP = false,
+                                 bool ForceSP = false) const;
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
@@ -54,7 +54,7 @@
                                   const TargetRegisterInfo *TRI) const override;
 
   /// \brief Can this function use the red zone for local allocations.
-  bool canUseRedZone(const MachineFunction &MF) const;
+  bool canUseRedZone(const MachineFunction &MF, unsigned StackSize) const;
 
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -115,7 +115,8 @@
 
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
-bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF,
+                                         unsigned StackSize) const {
   if (!EnableRedZone)
     return false;
   // Don't use the red zone if the function explicitly asks us not to.
@@ -124,10 +125,8 @@
     return false;
 
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  unsigned NumBytes = AFI->getLocalStackSize();
 
-  return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128);
+  return !(MFI->hasCalls() || hasFP(MF) || StackSize > 128);
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -316,7 +315,7 @@
       return;
     // REDZONE: If the stack size is less than 128 bytes, we don't need
     // to actually allocate.
-    if (canUseRedZone(MF))
+    if (canUseRedZone(MF, NumBytes))
       ++NumRedZoneFunctions;
     else {
       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
@@ -334,10 +333,18 @@
     return;
   }
 
-  NumBytes -= AFI->getCalleeSavedStackSize();
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+  auto CSStackSize = AFI->getCalleeSavedStackSize();
   // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes);
+  AFI->setLocalStackSize(NumBytes - CSStackSize);
+  bool CSRBumpsSP = AFI->getCalleeSaveRestoreUpdatesSP();
+  if (CSRBumpsSP) {
+    NumBytes -= CSStackSize;
+  } else {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                    MachineInstr::FrameSetup);
+    NumBytes = 0;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
   // Move past the saves of the callee-saved registers.
   MachineBasicBlock::iterator End = MBB.end();
@@ -346,6 +353,8 @@
   if (HasFP) {
     // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
     int FPOffset = AFI->getCalleeSavedStackSize() - 16;
+    if (!CSRBumpsSP)
+      FPOffset += AFI->getLocalStackSize();
 
     // Issue    sub fp, sp, FPOffset or
     //          mov fp,sp          when FPOffset is zero.
@@ -366,7 +375,7 @@
     }
 
     // If we're a leaf function, try using the red zone.
-    if (!canUseRedZone(MF))
+    if (!canUseRedZone(MF, NumBytes))
       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
       // the correct value here, as NumBytes also includes padding bytes,
       // which shouldn't be counted here.
@@ -569,6 +578,14 @@
   // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
   // it as the 2nd argument of AArch64ISD::TC_RETURN.
 
+  // If there is a single SP update, insert it before the ret and we're done.
+  if (!AFI->getCalleeSaveRestoreUpdatesSP()) {
+    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+                    NumBytes + ArgumentPopSize, TII,
+                    MachineInstr::FrameDestroy);
+    return;
+  }
+
   // Move past the restores of the callee-saved registers.
   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
   MachineBasicBlock::iterator Begin = MBB.begin();
@@ -583,7 +600,7 @@
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
   if (!hasFP(MF)) {
-    bool RedZone = canUseRedZone(MF);
+    bool RedZone = canUseRedZone(MF, NumBytes);
     // If this was a redzone leaf function, we don't need to restore the
     // stack pointer (but we may need to pop stack args for fastcc).
     if (RedZone && ArgumentPopSize == 0)
@@ -634,7 +651,8 @@
 
 int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
                                                      int FI, unsigned &FrameReg,
-                                                     bool PreferFP) const {
+                                                     bool PreferFP,
+                                                     bool ForceSP) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
@@ -666,12 +684,15 @@
       // using the FP regardless, though, as the SP offset is unknown
       // and we don't have a base pointer available. If an offset is
       // available via the FP and the SP, use whichever is closest.
-      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
-          (FPOffset >= -256 && Offset > -FPOffset))
+      if (PreferFP || MFI->hasVarSizedObjects())
         UseFP = true;
+      if (!ForceSP)
+        if (FPOffset >= 0 || (FPOffset >= -256 && Offset > -FPOffset))
+          UseFP = true;
     }
   }
 
+  assert(!(UseFP && ForceSP) && "ForceSP flag could not be honored");
   assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
          "In the presence of dynamic stack pointer realignment, "
          "non-argument objects cannot be accessed through the frame pointer");
@@ -689,8 +710,9 @@
     // If we're using the red zone for this function, the SP won't actually
     // be adjusted, so the offsets will be negative. They're also all
     // within range of the signed 9-bit immediate instructions.
-    if (canUseRedZone(MF))
-      Offset -= AFI->getLocalStackSize();
+    unsigned StackSize = AFI->getLocalStackSize();
+    if (canUseRedZone(MF, StackSize))
+      Offset -= StackSize;
   }
 
   return Offset;
@@ -790,14 +812,6 @@
     if (RPI.isPaired())
       ++i;
   }
-
-  // Align first offset to even 16-byte boundary to avoid additional SP
-  // adjustment instructions.
-  // Last pair offset is size of whole callee-save region for SP
-  // pre-dec/post-inc.
-  RegPairInfo &LastPair = RegPairs.back();
-  assert(AFI->getCalleeSavedStackSize() % 8 == 0);
-  LastPair.Offset = AFI->getCalleeSavedStackSize() / 8;
 }
 
 bool AArch64FrameLowering::spillCalleeSavedRegisters(
@@ -805,10 +819,12 @@
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
+  bool CSRBumpsSP = AFI->getCalleeSaveRestoreUpdatesSP();
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
 
   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
@@ -827,7 +843,7 @@
     // Rationale: This sequence saves uop updates compared to a sequence of
     // pre-increment spills like stp xi,xj,[sp,#-16]!
     // Note: Similar rationale and sequence for restores in epilog.
-    bool BumpSP = RPII == RegPairs.rbegin();
+    bool BumpSP = CSRBumpsSP && RPII == RegPairs.rbegin();
     if (RPI.IsGPR) {
       // For first spill use pre-increment store.
       if (BumpSP)
@@ -849,26 +865,37 @@
             dbgs() << ", " << RPI.FrameIdx+1;
           dbgs() << ")\n");
 
-    const int Offset = BumpSP ? -RPI.Offset : RPI.Offset;
+    int Offset = RPI.Offset;
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
-    if (BumpSP)
+    // Check and modify offset for pre-increment of SP.
+    if (BumpSP) {
       MIB.addReg(AArch64::SP, RegState::Define);
+      assert(Offset == 0);
+      assert(AFI->getCalleeSavedStackSize() % 8 == 0);
+      Offset = -(AFI->getCalleeSavedStackSize() / 8);
+      assert(Offset % 2 == 0 &&
+             "Callee-save store SP bump not 16-byte aligned");
+    }
 
+    int MIOffset;
     if (RPI.isPaired()) {
       MBB.addLiveIn(Reg1);
       MBB.addLiveIn(Reg2);
       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
-        .addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(AArch64::SP)
-        .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
-        .setMIFlag(MachineInstr::FrameSetup);
+        .addReg(Reg1, getPrologueDeath(MF, Reg1));
+      MIOffset = Offset; // [sp, #offset * 8], where factor * 8 is implicit
     } else {
       MBB.addLiveIn(Reg1);
-      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(AArch64::SP)
-        .addImm(BumpSP ? Offset * 8 : Offset) // pre-inc version is unscaled
-        .setMIFlag(MachineInstr::FrameSetup);
+      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+      MIOffset = BumpSP ? Offset * 8 : Offset; // pre-inc version is unscaled
     }
+    if (CSRBumpsSP)
+      MIB.addReg(AArch64::SP)
+        .addImm(MIOffset);
+    else
+      MIB.addFrameIndex(RPI.isPaired() ? RPI.FrameIdx + 1 : RPI.FrameIdx)
+        .addImm(0);
+    MIB.setMIFlag(MachineInstr::FrameSetup);
   }
   return true;
 }
@@ -878,6 +905,7 @@
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
@@ -885,6 +913,7 @@
   if (MI != MBB.end())
     DL = MI->getDebugLoc();
 
+  bool CSRBumpsSP = AFI->getCalleeSaveRestoreUpdatesSP();
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
 
   for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
@@ -901,7 +930,7 @@
     //    ldp     x22, x21, [sp], #48     // addImm(+6)
     // Note: see comment in spillCalleeSavedRegisters()
     unsigned LdrOpc;
-    bool BumpSP = RPII == std::prev(RegPairs.end());
+    bool BumpSP = CSRBumpsSP && RPII == std::prev(RegPairs.end());
     if (RPI.IsGPR) {
       if (BumpSP)
         LdrOpc = RPI.isPaired() ? AArch64::LDPXpost : AArch64::LDRXpost;
@@ -921,23 +950,35 @@
             dbgs() << ", " << RPI.FrameIdx+1;
           dbgs() << ")\n");
 
-    const int Offset = RPI.Offset;
+    int Offset = RPI.Offset;
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
-    if (BumpSP)
+    // Check and modify offset for post-decrement of SP.
+    if (BumpSP) {
       MIB.addReg(AArch64::SP, RegState::Define);
+      assert(Offset == 0);
+      assert(AFI->getCalleeSavedStackSize() % 8 == 0);
+      Offset = AFI->getCalleeSavedStackSize() / 8;
+      assert(Offset % 2 == 0 &&
+             "Callee-save restore SP bump not 16-byte aligned");
+    }
 
-    if (RPI.isPaired())
+    int MIOffset;
+    if (RPI.isPaired()) {
       MIB.addReg(Reg2, getDefRegState(true))
-        .addReg(Reg1, getDefRegState(true))
-        .addReg(AArch64::SP)
-        .addImm(Offset) // [sp], #offset * 8  or [sp, #offset * 8]
-                        // where the factor * 8 is implicit
-        .setMIFlag(MachineInstr::FrameDestroy);
+        .addReg(Reg1, getDefRegState(true));
+      MIOffset = Offset; // [sp], #offset * 8  or [sp, #offset * 8]
+                         // where the factor * 8 is implicit
+    } else {
+      MIB.addReg(Reg1, getDefRegState(true));
+      MIOffset = BumpSP ? Offset * 8 : Offset; // post-dec version is unscaled
+    }
+    if (CSRBumpsSP)
+      MIB.addReg(AArch64::SP)
+        .addImm(MIOffset);
     else
-      MIB.addReg(Reg1, getDefRegState(true))
-        .addReg(AArch64::SP)
-        .addImm(BumpSP ? Offset * 8 : Offset) // post-dec version is unscaled
-        .setMIFlag(MachineInstr::FrameDestroy);
+      MIB.addFrameIndex(RPI.isPaired() ? RPI.FrameIdx + 1 : RPI.FrameIdx)
+        .addImm(0);
+    MIB.setMIFlag(MachineInstr::FrameDestroy);
   }
   return true;
 }
@@ -1052,7 +1093,26 @@
     }
   }
 
-  // Round up to register pair alignment to avoid additional SP adjustment
-  // instructions.
-  AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+  // Check to see if we can combine the callee-save and local stack pointer
+  // adjustment into a single decrement/increment.
+  unsigned EstNonCSStackSize = MFI->estimateStackSize(MF);
+  unsigned StackSize = EstNonCSStackSize + 8 * NumRegsSpilled;
+  if (EstNonCSStackSize != 0 &&
+      // 512 is the maximum immediate for stp/ldp that will be used for
+      // callee-save save/restores
+      StackSize < 512 &&
+      !MFI->hasVarSizedObjects() && !RegInfo->needsStackRealignment(MF) &&
+      // This isn't strictly necessary, but it simplifies things a bit since
+      // the current RedZone handling code assumes the SP is adjusted by the
+      // callee-save save/restore code.
+      !canUseRedZone(MF, StackSize))
+    AFI->setCalleeSaveRestoreUpdatesSP(false);
+  else
+    // Round up to register pair alignment to avoid additional SP adjustment
+    // instructions.  In the combined case we don't need to do this since the
+    // combined CS and local stack pointer adjustment will be 16-byte aligned,
+    // and not doing so here allows us to use less stack in some cases.
+    NumRegsSpilled = alignTo(NumRegsSpilled, 2);
+
+  AFI->setCalleeSavedStackSize(8 * NumRegsSpilled);
 }
Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h
===================================================================
--- lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -83,18 +83,27 @@
   /// frame is unknown at compile time. e.g., in case of VLAs.
   bool StackRealigned;
 
+  /// True when the first/last callee-save save/restore store/load instruction
+  /// should decrement/increment the stack pointer.  When this is false, there
+  /// is a single decrement/increment of the stack pointer as the first/last
+  /// non-terminator instruction of the function that allocates/de-allocates
+  /// both the callee-save area and the local area of the stack.
+  bool CalleeSaveRestoreUpdatesSP;
+
 public:
   AArch64FunctionInfo()
       : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
         NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
         VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
-        IsSplitCSR(false), StackRealigned(false) {}
+        IsSplitCSR(false), StackRealigned(false),
+        CalleeSaveRestoreUpdatesSP(true) {}
 
   explicit AArch64FunctionInfo(MachineFunction &MF)
       : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
         NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
         VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
-        IsSplitCSR(false), StackRealigned(false) {
+        IsSplitCSR(false), StackRealigned(false),
+        CalleeSaveRestoreUpdatesSP(true) {
     (void)MF;
   }
 
@@ -121,6 +130,13 @@
   void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
   unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
 
+  void setCalleeSaveRestoreUpdatesSP(bool CSRUpdatesSP) {
+    CalleeSaveRestoreUpdatesSP = CSRUpdatesSP;
+  }
+  bool getCalleeSaveRestoreUpdatesSP() const {
+    return CalleeSaveRestoreUpdatesSP;
+  }
+
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
   unsigned getNumLocalDynamicTLSAccesses() const {
     return NumLocalDynamicTLSAccesses;
Index: lib/Target/AArch64/AArch64RegisterInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -380,10 +380,16 @@
   }
 
   // Modify MI as necessary to handle as much of 'Offset' as possible
-  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  bool ForceSP = MI.getFlag(MachineInstr::FrameSetup) ||
+                 MI.getFlag(MachineInstr::FrameDestroy);
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+                                           /*PreferFP=*/false, ForceSP);
   if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
     return;
 
+  assert(!MI.getFlag(MachineInstr::FrameSetup) &&
+         !MI.getFlag(MachineInstr::FrameDestroy) &&
+         "Frame setup/destroy can't use emergency spill slot");
   assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
          "Emergency spill slot is out of reach");
 
Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -98,8 +98,8 @@
 ; CHECK-LABEL: novla_nodynamicrealign_call
 ; CHECK: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK: stp	x19, x30, [sp, #-16]!
-; CHECK: sub	sp, sp, #16
+; CHECK: sub	sp, sp, #32
+; CHECK: stp	x19, x30, [sp, #16]
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK: .cfi_def_cfa_offset 32
 ; CHECK: .cfi_offset w30, -8
@@ -110,17 +110,19 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK: ldr	w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK: ldp	x19, x30, [sp], #16
+; CHECK: ldp	x19, x30, [sp, #16]
+; CHECK: add	sp, sp, #32
 ; CHECK: ret
 ; CHECK: .cfi_endproc
 
 ; CHECK-MACHO-LABEL: _novla_nodynamicrealign_call:
 ; CHECK-MACHO: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK-MACHO: stp	x20, x19, [sp, #-32]!
+; CHECK-MACHO: sub	sp, sp, #48
+; CHECK-MACHO: stp	x20, x19, [sp, #16]
 ;   Check that the frame pointer is created:
-; CHECK-MACHO: stp	x29, x30, [sp, #16]
-; CHECK-MACHO: add	x29, sp, #16
+; CHECK-MACHO: stp	x29, x30, [sp, #32]
+; CHECK-MACHO: add	x29, sp, #32
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK-MACHO: .cfi_def_cfa w29, 16
 ; CHECK-MACHO: .cfi_offset w30, -8
@@ -133,8 +135,9 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK-MACHO: ldr	w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK-MACHO: ldp	x29, x30, [sp, #16]
-; CHECK-MACHO: ldp	x20, x19, [sp], #32
+; CHECK-MACHO: ldp	x29, x30, [sp, #32]
+; CHECK-MACHO: ldp	x20, x19, [sp, #16]
+; CHECK-MACHO: add	sp, sp, #48
 ; CHECK-MACHO: ret
 ; CHECK-MACHO: .cfi_endproc
 
Index: test/CodeGen/AArch64/arm64-aapcs-be.ll
===================================================================
--- test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -32,7 +32,7 @@
 
 define void @test_block_addr_callee() {
 ; CHECK-LABEL: test_block_addr_callee:
-; CHECK: str {{[a-z0-9]+}}, [sp, #-16]!
+; CHECK: str {{[a-z0-9]+}}, [sp]
 ; CHECK: bl test_block_addr
   %val = insertvalue [1 x float] undef, float 0.0, 0
   call float @test_block_addr([8 x float] undef, [1 x float] %val)
Index: test/CodeGen/AArch64/arm64-abi.ll
===================================================================
--- test/CodeGen/AArch64/arm64-abi.ll
+++ test/CodeGen/AArch64/arm64-abi.ll
@@ -130,7 +130,7 @@
 ; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; FAST-LABEL: test3
-; FAST: sub sp, sp, #32
+; FAST: sub sp, sp, #48
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
   %0 = load <2 x i32>, <2 x i32>* %in, align 8
Index: test/CodeGen/AArch64/arm64-abi_align.ll
===================================================================
--- test/CodeGen/AArch64/arm64-abi_align.ll
+++ test/CodeGen/AArch64/arm64-abi_align.ll
@@ -291,7 +291,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller42
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-24 = sp+72
 ; Space for s2 is allocated at sp+48
 ; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -317,8 +317,8 @@
 define i32 @caller42_stack() #3 {
 entry:
 ; CHECK-LABEL: caller42_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{x[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{x[0-9]+}}, [sp, #48]
@@ -399,7 +399,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller43
-; FAST: mov x29, sp
+; FAST: add x29, sp, #64
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 ; FAST: add x1, sp, #32
@@ -429,8 +429,8 @@
 define i32 @caller43_stack() #3 {
 entry:
 ; CHECK-LABEL: caller43_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{q[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{q[0-9]+}}, [sp, #48]
@@ -446,7 +446,7 @@
 ; CHECK: str w[[C]], [sp]
 
 ; FAST-LABEL: caller43_stack
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-32 = sp+64
 ; Space for s2 is allocated at sp+32
 ; FAST: sub x[[A:[0-9]+]], x29, #32
@@ -508,7 +508,7 @@
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
-; CHECK: str {{w[0-9]+}}, [sp, #-16]!
+; CHECK: str {{w[0-9]+}}, [sp]
 ; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
 ; FAST: mov x[[R0:[0-9]+]], sp
Index: test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
===================================================================
--- test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -14,7 +14,7 @@
 define void @main() nounwind {
 entry:
 ; CHECK: main
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #16
 ; CHECK: mov [[REG:x[0-9]+]], sp
 ; CHECK-NEXT: add x0, [[REG]], #8
   %E = alloca %struct.S2Ty, align 4
Index: test/CodeGen/AArch64/arm64-hello.ll
===================================================================
--- test/CodeGen/AArch64/arm64-hello.ll
+++ test/CodeGen/AArch64/arm64-hello.ll
@@ -2,26 +2,26 @@
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX
 
 ; CHECK-LABEL: main:
-; CHECK:	stp	x29, x30, [sp, #-16]!
-; CHECK-NEXT:	mov	x29, sp
-; CHECK-NEXT:	sub	sp, sp, #16
+; CHECK:	sub	sp, sp, #32
+; CHECK-NEXT:	stp	x29, x30, [sp, #16]
+; CHECK-NEXT:	add	x29, sp, #16
 ; CHECK-NEXT:	stur	wzr, [x29, #-4]
 ; CHECK:	adrp	x0, L_.str@PAGE
 ; CHECK:	add	x0, x0, L_.str@PAGEOFF
 ; CHECK-NEXT:	bl	_puts
-; CHECK-NEXT:	add	sp, sp, #16
-; CHECK-NEXT:	ldp	x29, x30, [sp], #16
+; CHECK-NEXT:	ldp	x29, x30, [sp, #16]
+; CHECK-NEXT:	add	sp, sp, #32
 ; CHECK-NEXT:	ret
 
 ; CHECK-LINUX-LABEL: main:
-; CHECK-LINUX:	str	x30, [sp, #-16]!
-; CHECK-LINUX-NEXT:	sub	sp, sp, #16
-; CHECK-LINUX-NEXT:	str	wzr, [sp, #12]
+; CHECK-LINUX:	sub	sp, sp, #16
+; CHECK-LINUX-NEXT:	str	x30, [sp, #8]
+; CHECK-LINUX-NEXT:	str	wzr, [sp, #4]
 ; CHECK-LINUX:	adrp	x0, .L.str
 ; CHECK-LINUX:	add	x0, x0, :lo12:.L.str
 ; CHECK-LINUX-NEXT:	bl	puts
+; CHECK-LINUX-NEXT:	ldr	x30, [sp, #8]
 ; CHECK-LINUX-NEXT:	add	sp, sp, #16
-; CHECK-LINUX-NEXT:	ldr	x30, [sp], #16
 ; CHECK-LINUX-NEXT:	ret
 
 @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
Index: test/CodeGen/AArch64/arm64-join-reserved.ll
===================================================================
--- test/CodeGen/AArch64/arm64-join-reserved.ll
+++ test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -5,7 +5,7 @@
 ; A move isn't necessary.
 ; <rdar://problem/11492712>
 ; CHECK-LABEL: g:
-; CHECK: str xzr, [sp, #-16]!
+; CHECK: str xzr, [sp]
 ; CHECK: bl
 ; CHECK: ret
 define void @g() nounwind ssp {
Index: test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
===================================================================
--- test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -7,7 +7,7 @@
 entry:
 ; CHECK-LABEL: jscall_patchpoint_codegen:
 ; CHECK:       Ltmp
-; CHECK:       str x{{.+}}, [sp, #-16]!
+; CHECK:       str x{{.+}}, [sp]
 ; CHECK-NEXT:  mov  x0, x{{.+}}
 ; CHECK:       Ltmp
 ; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
@@ -16,7 +16,7 @@
 ; CHECK-NEXT:  blr x16
 ; FAST-LABEL:  jscall_patchpoint_codegen:
 ; FAST:        Ltmp
-; FAST:        str x{{.+}}, [sp, #-16]!
+; FAST:        str x{{.+}}, [sp]
 ; FAST:        Ltmp
 ; FAST-NEXT:   movz  x16, #0xffff, lsl #32
 ; FAST-NEXT:   movk  x16, #0xdead, lsl #16
@@ -50,7 +50,7 @@
 ; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
 ; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
 ; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
-; FAST-NEXT:   str [[REG1]], [sp, #-32]!
+; FAST-NEXT:   str [[REG1]], [sp]
 ; FAST-NEXT:   str [[REG2]], [sp, #16]
 ; FAST-NEXT:   str [[REG3]], [sp, #24]
 ; FAST:        Ltmp
@@ -90,7 +90,7 @@
 ; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
 ; FAST-NEXT:   orr [[REG4:w[0-9]+]], wzr, #0x8
 ; FAST-NEXT:   movz [[REG5:x[0-9]+]], #0xa
-; FAST-NEXT:   str [[REG1]], [sp, #-64]!
+; FAST-NEXT:   str [[REG1]], [sp]
 ; FAST-NEXT:   str [[REG2]], [sp, #16]
 ; FAST-NEXT:   str [[REG3]], [sp, #24]
 ; FAST-NEXT:   str [[REG4]], [sp, #36]
Index: test/CodeGen/AArch64/arm64-patchpoint.ll
===================================================================
--- test/CodeGen/AArch64/arm64-patchpoint.ll
+++ test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -26,10 +26,11 @@
 ; as a leaf function.
 ;
 ; CHECK-LABEL: caller_meta_leaf
-; CHECK:       mov x29, sp
-; CHECK-NEXT:  sub sp, sp, #32
+; CHECK:       sub sp, sp, #48
+; CHECK-NEXT:  stp x29, x30, [sp, #32]
+; CHECK-NEXT:  add x29, sp, #32
 ; CHECK:       Ltmp
-; CHECK:       add sp, sp, #32
+; CHECK:       add sp, sp, #48
 ; CHECK:       ret
 
 define void @caller_meta_leaf() {
Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -13,9 +13,9 @@
 ; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[SAVE_SP]], sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16]
+; CHECK-NEXT: add [[SAVE_SP]], sp, #16
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
@@ -33,8 +33,8 @@
 ; Without shrink-wrapping, epilogue is in the exit block.
 ; DISABLE: [[EXIT_LABEL]]:
 ; Epilogue code.
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp], #16
+; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ;
 ; With shrink-wrapping, exit block is a simple return.
 ; ENABLE: [[EXIT_LABEL]]:
@@ -454,9 +454,9 @@
 ; ENABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[NEW_SP:x[0-9]+]], sp
-; CHECK-NEXT: sub sp, sp, #48
+; CHECK: sub sp, sp, #64
+; CHECK-NEXT: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #48]
+; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #48
 ;
 ; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ; Setup of the varags.
@@ -473,8 +473,8 @@
 ; DISABLE: [[IFEND_LABEL]]: ; %if.end
 ;
 ; Epilogue code.
-; CHECK: add sp, sp, #48
-; CHECK-NEXT: ldp [[CSR1]], [[CSR2]], [sp], #16
+; CHECK: ldp [[CSR1]], [[CSR2]], [sp, #48]
+; CHECK-NEXT: add sp, sp, #64
 ; CHECK-NEXT: ret
 ;
 ; ENABLE: [[ELSE_LABEL]]: ; %if.else
Index: test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- test/CodeGen/AArch64/arm64-virtual_base.ll
+++ test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -34,9 +34,9 @@
 define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
 ; CHECK: Precompute_Patch_Values
 ; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
-; CHECK-NEXT: str [[VAL]], [sp, #232]
+; CHECK-NEXT: str [[VAL]], [sp, #240]
 ; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
-; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
+; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #224]
 entry:
   %Control_Points = alloca [16 x [3 x double]], align 8
   %arraydecay5.3.1 = getelementptr inbounds [16 x [3 x double]], [16 x [3 x double]]* %Control_Points, i64 0, i64 9, i64 0
Index: test/CodeGen/AArch64/fastcc.ll
===================================================================
--- test/CodeGen/AArch64/fastcc.ll
+++ test/CodeGen/AArch64/fastcc.ll
@@ -7,13 +7,16 @@
 
 define fastcc void @func_stack0() {
 ; CHECK-LABEL: func_stack0:
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 ; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL-NEXT: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL-NEXT: stp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -42,27 +45,29 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-TAIL: ldp     x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #48
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 
 ; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #32]
+; CHECK-TAIL: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -91,23 +96,24 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add     sp, sp, #16
+; CHECK-TAIL: ldp     x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add     sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32:
-; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
 
 ; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: add x29, sp, #32
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -136,13 +142,12 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add     sp, sp, #32
+; CHECK-TAIL: ldp     x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add     sp, sp, #80
 ; CHECK-TAIL-NEXT: ret
 }
 
@@ -180,22 +185,21 @@
 ; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
 define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32_leaf_local:
-; CHECK: str     x20, [sp, #-16]!
-; CHECK-NEXT: sub     sp, sp, #16
+; CHECK: sub     sp, sp, #32
+; CHECK-NEXT: str     x20, [sp, #24]
 ; CHECK: nop
 ; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: add     sp, sp, #16
-; CHECK-NEXT: ldr     x20, [sp], #16
+; CHECK-NEXT: ldr     x20, [sp, #24]
+; CHECK-NEXT: add     sp, sp, #32
 ; CHECK-NEXT: ret
 
 ; CHECK-TAIL-LABEL: func_stack32_leaf_local:
-; CHECK-TAIL: str     x20, [sp, #-16]!
-; CHECK-TAIL-NEXT: sub     sp, sp, #16
+; CHECK-TAIL: sub     sp, sp, #32
+; CHECK-TAIL-NEXT: str     x20, [sp, #24]
 ; CHECK-TAIL: nop
 ; CHECK-TAIL-NEXT: //NO_APP
-; CHECK-TAIL-NEXT: add     sp, sp, #16
-; CHECK-TAIL-NEXT: ldr     x20, [sp], #16
-; CHECK-TAIL-NEXT: add     sp, sp, #32
+; CHECK-TAIL-NEXT: ldr     x20, [sp, #24]
+; CHECK-TAIL-NEXT: add     sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 
 ; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local:
Index: test/CodeGen/AArch64/func-calls.ll
===================================================================
--- test/CodeGen/AArch64/func-calls.ll
+++ test/CodeGen/AArch64/func-calls.ll
@@ -89,11 +89,11 @@
   ; that varstruct is passed on the stack. Rather dependent on how a
   ; memcpy gets created, but the following works for now.
 
-; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
 
-; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]!
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
 
Index: test/CodeGen/AArch64/remat.ll
===================================================================
--- test/CodeGen/AArch64/remat.ll
+++ test/CodeGen/AArch64/remat.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
 
-%X = type { i64, i64, i64 }
+%X = type { i64, i64, i64, i64 }
 declare void @f(%X*)
 define void @t() {
 entry:
Index: test/CodeGen/AArch64/tailcall-implicit-sret.ll
===================================================================
--- test/CodeGen/AArch64/tailcall-implicit-sret.ll
+++ test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false -disable-post-ra | FileCheck %s
 ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
Index: test/DebugInfo/AArch64/prologue_end.ll
===================================================================
--- test/DebugInfo/AArch64/prologue_end.ll
+++ test/DebugInfo/AArch64/prologue_end.ll
@@ -9,9 +9,9 @@
 define void @prologue_end_test() nounwind uwtable !dbg !4 {
   ; CHECK: prologue_end_test:
   ; CHECK: .cfi_startproc
-  ; CHECK: stp x29, x30
-  ; CHECK: mov x29, sp
   ; CHECK: sub sp, sp
+  ; CHECK: stp x29, x30
+  ; CHECK: add x29, sp
   ; CHECK: .loc 1 3 3 prologue_end
   ; CHECK: bl _func
   ; CHECK: bl _func