diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -370,29 +370,35 @@
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   const ARMFrameLowering *TFI = getFrameLowering(MF);
 
-  // When outgoing call frames are so large that we adjust the stack pointer
-  // around the call, we can no longer use the stack pointer to reach the
-  // emergency spill slot.
+  // If we have stack realignment and VLAs, we have no pointer to use to
+  // access the stack. If we have stack realignment, and a large call frame,
+  // we have no place to allocate the emergency spill slot.
   if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
     return true;
 
   // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
   // negative range for ldr/str (255), and thumb1 is positive offsets only.
+  //
   // It's going to be better to use the SP or Base Pointer instead. When there
   // are variable sized objects, we can't reference off of the SP, so we
   // reserve a Base Pointer.
-  if (AFI->isThumbFunction() && MFI.hasVarSizedObjects()) {
-    // Conservatively estimate whether the negative offset from the frame
-    // pointer will be sufficient to reach. If a function has a smallish
-    // frame, it's less likely to have lots of spills and callee saved
-    // space, so it's all more likely to be within range of the frame pointer.
-    // If it's wrong, the scavenger will still enable access to work, it just
-    // won't be optimal.
-    if (AFI->isThumb2Function() && MFI.getLocalFrameSize() < 128)
-      return false;
+  //
+  // For Thumb2, estimate whether a negative offset from the frame pointer
+  // will be sufficient to reach the whole stack frame. If a function has a
+  // smallish frame, it's less likely to have lots of spills and callee saved
+  // space, so it's all more likely to be within range of the frame pointer.
+  // If it's wrong, the scavenger will still enable access to work, it just
+  // won't be optimal. (We should always be able to reach the emergency
+  // spill slot from the frame pointer.)
+  if (AFI->isThumb2Function() && MFI.hasVarSizedObjects() &&
+      MFI.getLocalFrameSize() >= 128)
+    return true;
+  // For Thumb1, if sp moves, nothing is in range, so force a base pointer.
+  // This is necessary for correctness in cases where we need an emergency
+  // spill slot. (In Thumb1, we can't use a negative offset from the frame
+  // pointer.)
+  if (AFI->isThumb1OnlyFunction() && !TFI->hasReservedCallFrame(MF))
     return true;
-  }
-
   return false;
 }
 
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -344,6 +344,10 @@
 /// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
 /// this to produce a conservative estimate that we check in an assert() later.
 static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+  // For Thumb1, push.w isn't available, so the first push will always push
+  // r7 and lr onto the stack first.
+  if (AFI.isThumb1OnlyFunction())
+    return -AFI.getArgRegsSaveSize() - (2 * 4);
   // This is a conservative estimation: Assume the frame pointer being r7 and
   // pc("r15") up to r8 getting spilled before (= 8 registers).
   return -AFI.getArgRegsSaveSize() - (8 * 4);
@@ -954,8 +958,12 @@
     }
   }
   // Use the base pointer if we have one.
-  if (RegInfo->hasBasePointer(MF))
+  // FIXME: Maybe prefer sp on Thumb1 if it's legal and the offset is cheaper?
+  // That can happen if we forced a base pointer for a large call frame.
+  if (RegInfo->hasBasePointer(MF)) {
     FrameReg = RegInfo->getBaseRegister();
+    Offset -= SPAdj;
+  }
   return Offset;
 }
 
@@ -1775,13 +1783,59 @@
   }
 
   EstimatedStackSize += 16; // For possible paddings.
 
-  unsigned EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+  unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
+  if (AFI->isThumb1OnlyFunction()) {
+    // For Thumb1, don't bother to iterate over the function. The only
+    // instruction that requires an emergency spill slot is a store to a
+    // frame index.
+    //
+    // tSTRspi, which is used for sp-relative accesses, has an 8-bit unsigned
+    // immediate. tSTRi, which is used for bp- and fp-relative accesses, has
+    // a 5-bit unsigned immediate.
+    //
+    // We could try to check if the function actually contains a tSTRspi
+    // that might need the spill slot, but it's not really important.
+    // Functions with VLAs or extremely large call frames are rare, and
+    // if a function is allocating more than 1KB of stack, an extra 4-byte
+    // slot probably isn't relevant.
+    if (RegInfo->hasBasePointer(MF))
+      EstimatedRSStackSizeLimit = (1U << 5) * 4;
+    else
+      EstimatedRSStackSizeLimit = (1U << 8) * 4;
+    EstimatedRSFixedSizeLimit = (1U << 5) * 4;
+  } else {
+    EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+    EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
+  }
+  // Final estimate of whether sp or bp-relative accesses might require
+  // scavenging.
+  bool HasLargeStack = EstimatedStackSize > EstimatedRSStackSizeLimit;
+
+  // If the stack pointer moves and we don't have a base pointer, the
+  // estimate logic doesn't work. The actual offsets might be larger when
+  // we're constructing a call frame, or we might need to use negative
+  // offsets from fp.
+  bool HasMovingSP = MFI.hasVarSizedObjects() ||
+                     (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+  bool HasBPOrFixedSP = RegInfo->hasBasePointer(MF) || !HasMovingSP;
+
+  // If we have a frame pointer, we assume arguments will be accessed
+  // relative to the frame pointer. Check whether fp-relative accesses to
+  // arguments require scavenging.
+  //
+  // We could do slightly better on Thumb1; in some cases, an sp-relative
+  // offset would be legal even though an fp-relative offset is not.
   int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
-  bool BigFrameOffsets = EstimatedStackSize >= EstimatedRSStackSizeLimit ||
-                         MFI.hasVarSizedObjects() ||
-                         (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)) ||
-                         // For large argument stacks fp relative addressed may overflow.
-                         (HasFP && (MaxFixedOffset - MaxFPOffset) >= (int)EstimatedRSStackSizeLimit);
+  bool HasLargeArgumentList =
+      HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
+
+  bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
+                         HasLargeArgumentList;
+  LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
+                    << "; EstimatedStack: " << EstimatedStackSize
+                    << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset
+                    << "; BigFrameOffsets: " << BigFrameOffsets
+                    << "\n");
   if (BigFrameOffsets ||
       !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
     AFI->setHasStackFrame(true);
@@ -1806,8 +1860,17 @@
       CS1Spilled = true;
     }
 
-  // This is true when we inserted a spill for an unused register that can now
-  // be used for register scavenging.
+  // This is true when we inserted a spill for a callee-save GPR which is
+  // not otherwise used by the function. This guarantees it is possible
+  // to scavenge a register to hold the address of a stack slot. On Thumb1,
+  // the register must be a valid operand to tSTRi, i.e. r4-r7. For other
+  // subtargets, this is any GPR, i.e. r4-r11 or lr.
+  //
+  // If we don't insert a spill, we instead allocate an emergency spill
+  // slot, which can be used by scavenging to spill an arbitrary register.
+  //
+  // We currently don't try to figure out whether any specific instruction
+  // requires scavenging an additional register.
   bool ExtraCSSpill = false;
 
   if (AFI->isThumb1OnlyFunction()) {
@@ -1916,7 +1979,7 @@
           NumGPRSpills++;
           CS1Spilled = true;
           assert(!MRI.isReserved(Reg) && "Should not be reserved");
-          if (!MRI.isPhysRegUsed(Reg))
+          if (Reg != ARM::LR && !MRI.isPhysRegUsed(Reg))
             ExtraCSSpill = true;
           UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
           if (Reg == ARM::LR)
@@ -1941,7 +2004,8 @@
       UnspilledCS1GPRs.erase(LRPos);
 
     ForceLRSpill = false;
-    if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR))
+    if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR) &&
+        !AFI->isThumb1OnlyFunction())
       ExtraCSSpill = true;
   }
 
@@ -1963,7 +2027,8 @@
             SavedRegs.set(Reg);
             LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
                               << " to make up alignment\n");
-            if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
+            if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg) &&
+                !(Reg == ARM::LR && AFI->isThumb1OnlyFunction()))
               ExtraCSSpill = true;
             break;
           }
@@ -1992,8 +2057,7 @@
           unsigned Reg = UnspilledCS1GPRs.back();
           UnspilledCS1GPRs.pop_back();
           if (!MRI.isReserved(Reg) &&
-              (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
-               Reg == ARM::LR)) {
+              (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) {
             Extras.push_back(Reg);
             NumExtras--;
           }
@@ -2016,10 +2080,10 @@
         ExtraCSSpill = true;
       }
     }
-    if (!ExtraCSSpill && !AFI->isThumb1OnlyFunction()) {
-      // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
-      // closest to SP or frame pointer.
+    if (!ExtraCSSpill) {
+      // Reserve a slot closest to SP or frame pointer.
       assert(RS && "Register scavenging not provided");
+      LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
       const TargetRegisterClass &RC = ARM::GPRRegClass;
       unsigned Size = TRI->getSpillSize(RC);
       unsigned Align = TRI->getSpillAlignment(RC);
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1150,15 +1150,22 @@
     if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
       Base = N.getOperand(0);
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      // For LHS+RHS to result in an offset that's a multiple of 4 the object
-      // indexed by the LHS must be 4-byte aligned.
+      // Make sure the offset is inside the object, or we might fail to
+      // allocate an emergency spill slot. (An out-of-range access is UB, but
+      // it could show up anyway.)
       MachineFrameInfo &MFI = MF->getFrameInfo();
-      if (MFI.getObjectAlignment(FI) < 4)
-        MFI.setObjectAlignment(FI, 4);
-      Base = CurDAG->getTargetFrameIndex(
-          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
-      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
-      return true;
+      if (RHSC * 4 < MFI.getObjectSize(FI)) {
+        // For LHS+RHS to result in an offset that's a multiple of 4 the object
+        // indexed by the LHS must be 4-byte aligned.
+        if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4)
+          MFI.setObjectAlignment(FI, 4);
+        if (MFI.getObjectAlignment(FI) >= 4) {
+          Base = CurDAG->getTargetFrameIndex(
+              FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+          OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+          return true;
+        }
+      }
     }
   }
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3652,7 +3652,8 @@
     // argument passed via stack.
     int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                     CCInfo.getInRegsParamsCount(),
-                                    CCInfo.getNextStackOffset(), 4);
+                                    CCInfo.getNextStackOffset(),
+                                    std::max(4U, TotalArgRegsSaveSize));
     AFI->setVarArgsFrameIndex(FrameIndex);
   }
 
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -63,15 +63,52 @@
   return !MFI.hasVarSizedObjects();
 }
 
-static void emitSPUpdate(MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator &MBBI,
-                         const TargetInstrInfo &TII, const DebugLoc &dl,
-                         const ThumbRegisterInfo &MRI, int NumBytes,
-                         unsigned MIFlags = MachineInstr::NoFlags) {
+static void
+emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI,
+                             const TargetInstrInfo &TII, const DebugLoc &dl,
+                             const ThumbRegisterInfo &MRI, int NumBytes,
+                             unsigned ScratchReg, unsigned MIFlags) {
+  // If it would take more than three instructions to adjust the stack pointer
+  // using tADDspi/tSUBspi, load an immediate instead.
+  if (std::abs(NumBytes) > 508 * 3) {
+    // We use a different codepath here from the normal
+    // emitThumbRegPlusImmediate so we don't have to deal with register
+    // scavenging. (Scavenging could try to use the emergency spill slot
+    // before we've actually finished setting up the stack.)
+    if (ScratchReg == ARM::NoRegister)
+      report_fatal_error("Failed to emit Thumb1 stack adjustment");
+    MachineFunction &MF = *MBB.getParent();
+    const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
+    if (ST.genExecuteOnly()) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ScratchReg)
+          .addImm(NumBytes).setMIFlags(MIFlags);
+    } else {
+      MRI.emitLoadConstPool(MBB, MBBI, dl, ScratchReg, 0, NumBytes, ARMCC::AL,
+                            0, MIFlags);
+    }
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP)
+        .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill)
+        .add(predOps(ARMCC::AL));
+    return;
+  }
+  // FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate
+  // won't change.
   emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
                             MRI, MIFlags);
+}
 
+static void emitCallSPUpdate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI,
+                             const TargetInstrInfo &TII, const DebugLoc &dl,
+                             const ThumbRegisterInfo &MRI, int NumBytes,
+                             unsigned MIFlags = MachineInstr::NoFlags) {
+  emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, MIFlags);
 }
 
 MachineBasicBlock::iterator Thumb1FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -95,10 +132,10 @@
       // Replace the pseudo instruction with a new instruction...
       unsigned Opc = Old.getOpcode();
       if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
-        emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+        emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
       } else {
         assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
-        emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+        emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
       }
     }
   }
@@ -141,8 +178,8 @@
   int FramePtrSpillFI = 0;
 
   if (ArgRegsSaveSize) {
-    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
-                 MachineInstr::FrameSetup);
+    emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+                                 ARM::NoRegister, MachineInstr::FrameSetup);
     CFAOffset -= ArgRegsSaveSize;
     unsigned CFIIndex = MF.addFrameInst(
         MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
@@ -153,8 +190,9 @@
 
   if (!AFI->hasStackFrame()) {
     if (NumBytes - ArgRegsSaveSize != 0) {
-      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
-                   MachineInstr::FrameSetup);
+      emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+                                   -(NumBytes - ArgRegsSaveSize),
+                                   ARM::NoRegister, MachineInstr::FrameSetup);
       CFAOffset -= NumBytes - ArgRegsSaveSize;
       unsigned CFIIndex = MF.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
@@ -331,8 +369,20 @@
 
   if (NumBytes) {
     // Insert it after all the callee-save spills.
-    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
-                 MachineInstr::FrameSetup);
+    //
+    // For a large stack frame, we might need a scratch register to store
+    // the size of the frame. We know all callee-save registers are free
+    // at this point in the prologue, so pick one.
+    unsigned ScratchRegister = ARM::NoRegister;
+    for (auto &I : CSI) {
+      unsigned Reg = I.getReg();
+      if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+        ScratchRegister = Reg;
+        break;
+      }
+    }
+    emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                                 ScratchRegister, MachineInstr::FrameSetup);
     if (!HasFP) {
       CFAOffset -= NumBytes;
       unsigned CFIIndex = MF.addFrameInst(
@@ -437,7 +487,9 @@
 
   if (!AFI->hasStackFrame()) {
     if (NumBytes - ArgRegsSaveSize != 0)
-      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
+      emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+                                   NumBytes - ArgRegsSaveSize, ARM::NoRegister,
+                                   MachineInstr::NoFlags);
   } else {
     // Unwind MBBI to point to first LDR / VLDRD.
     if (MBBI != MBB.begin()) {
@@ -472,13 +524,27 @@
           .addReg(FramePtr)
           .add(predOps(ARMCC::AL));
     } else {
+      // For a large stack frame, we might need a scratch register to store
+      // the size of the frame. We know all callee-save registers are free
+      // at this point in the epilogue, so pick one.
+      unsigned ScratchRegister = ARM::NoRegister;
+      bool HasFP = hasFP(MF);
+      for (auto &I : MFI.getCalleeSavedInfo()) {
+        unsigned Reg = I.getReg();
+        if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+          ScratchRegister = Reg;
+          break;
+        }
+      }
       if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
           &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
         MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
         if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
-          emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+          emitPrologueEpilogueSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes,
+                                       ScratchRegister, MachineInstr::NoFlags);
       } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
-        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+        emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes,
+                                     ScratchRegister, MachineInstr::NoFlags);
     }
   }
 
@@ -665,7 +731,9 @@
   // Advance past the pop instruction.
   MBBI++;
   // Increment the SP.
-  emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize + 4);
+  emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+                               ArgRegsSaveSize + 4, ARM::NoRegister,
+                               MachineInstr::NoFlags);
   return true;
 }
 
@@ -706,7 +774,8 @@
       .add(predOps(ARMCC::AL))
       .addReg(PopReg, RegState::Define);
 
-  emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+  emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize,
+                               ARM::NoRegister, MachineInstr::NoFlags);
 
   BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
       .addReg(ARM::LR, RegState::Define)
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/llvm/lib/Target/ARM/ThumbRegisterInfo.h
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.h
@@ -51,14 +51,10 @@
                                const ARMBaseInstrInfo &TII) const;
   void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                          int64_t Offset) const override;
-  bool saveScavengerRegister(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I,
-                             MachineBasicBlock::iterator &UseMI,
-                             const TargetRegisterClass *RC,
-                             unsigned Reg) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
 };
 }
 
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -446,63 +446,6 @@
   (void)Done;
 }
 
-/// saveScavengerRegister - Spill the register so it can be used by the
-/// register scavenger. Return true.
-bool ThumbRegisterInfo::saveScavengerRegister(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-    MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
-    unsigned Reg) const {
-
-  const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>();
-  if (!STI.isThumb1Only())
-    return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg);
-
-  // Thumb1 can't use the emergency spill slot on the stack because
-  // ldr/str immediate offsets must be positive, and if we're referencing
-  // off the frame pointer (if, for example, there are alloca() calls in
-  // the function, the offset will be negative. Use R12 instead since that's
-  // a call clobbered register that we know won't be used in Thumb1 mode.
-  const TargetInstrInfo &TII = *STI.getInstrInfo();
-  DebugLoc DL;
-  BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
-      .addReg(ARM::R12, RegState::Define)
-      .addReg(Reg, RegState::Kill)
-      .add(predOps(ARMCC::AL));
-
-  // The UseMI is where we would like to restore the register. If there's
-  // interference with R12 before then, however, we'll need to restore it
-  // before that instead and adjust the UseMI.
-  bool done = false;
-  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
-    if (II->isDebugInstr())
-      continue;
-    // If this instruction affects R12, adjust our restore point.
-    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = II->getOperand(i);
-      if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
-        UseMI = II;
-        done = true;
-        break;
-      }
-      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
-          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
-        continue;
-      if (MO.getReg() == ARM::R12) {
-        UseMI = II;
-        done = true;
-        break;
-      }
-    }
-  }
-  // Restore the register from R12
-  BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr))
-      .addReg(Reg, RegState::Define)
-      .addReg(ARM::R12, RegState::Kill)
-      .add(predOps(ARMCC::AL));
-
-  return true;
-}
-
 void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                             int SPAdj, unsigned FIOperandNum,
                                             RegScavenger *RS) const {
@@ -618,3 +561,14 @@
   if (MI.isPredicable())
     MIB.add(predOps(ARMCC::AL));
 }
+
+bool
+ThumbRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  if (MF.getSubtarget<ARMSubtarget>().isThumb1Only()) {
+    // For Thumb1, the emergency spill slot must be some small positive
+    // offset from the base/stack pointer.
+    return false;
+  }
+  // For Thumb2, put the emergency spill slot next to FP.
+  return true;
+}
diff --git a/llvm/test/CodeGen/ARM/ldrex-frame-size.ll b/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
--- a/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
+++ b/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
@@ -11,9 +11,9 @@
 define void @test_large_frame() {
 ; CHECK-LABEL: test_large_frame:
 ; CHECK: push
-; CHECK: sub.w sp, sp, #1004
+; CHECK: sub.w sp, sp, #1008
 
-  %ptr = alloca i32, i32 251
+  %ptr = alloca i32, i32 252
   %addr = getelementptr i32, i32* %ptr, i32 1
   call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
@@ -24,9 +24,9 @@
 define void @test_small_frame() {
 ; CHECK-LABEL: test_small_frame:
 ; CHECK-NOT: push
-; CHECK: sub.w sp, sp, #1000
+; CHECK: sub.w sp, sp, #1004
 
-  %ptr = alloca i32, i32 250
+  %ptr = alloca i32, i32 251
   %addr = getelementptr i32, i32* %ptr, i32 1
   call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
diff --git a/llvm/test/CodeGen/ARM/scavenging.mir b/llvm/test/CodeGen/ARM/scavenging.mir
deleted file mode 100644
--- a/llvm/test/CodeGen/ARM/scavenging.mir
+++ /dev/null
@@ -1,66 +0,0 @@
-# RUN: llc -o - %s -mtriple=thumb-arm-none-eabi -mcpu=cortex-m0 -run-pass scavenger-test | FileCheck %s
----
-# CHECK-LABEL: name: scavengebug0
-# Make sure we are not spilling/using a physreg used in the very last
-# instruction of the scavenging range.
-# CHECK-NOT: tSTRi {{.*}}$r0,{{.*}}$r0
-# CHECK-NOT: tSTRi {{.*}}$r1,{{.*}}$r1
-# CHECK-NOT: tSTRi {{.*}}$r2,{{.*}}$r2
-# CHECK-NOT: tSTRi {{.*}}$r3,{{.*}}$r3
-# CHECK-NOT: tSTRi {{.*}}$r4,{{.*}}$r4
-# CHECK-NOT: tSTRi {{.*}}$r5,{{.*}}$r5
-# CHECK-NOT: tSTRi {{.*}}$r6,{{.*}}$r6
-# CHECK-NOT: tSTRi {{.*}}$r7,{{.*}}$r7
-name: scavengebug0
-body: |
-  bb.0:
-    ; Bring up register pressure to force emergency spilling
-    $r0 = IMPLICIT_DEF
-    $r1 = IMPLICIT_DEF
-    $r2 = IMPLICIT_DEF
-    $r3 = IMPLICIT_DEF
-    $r4 = IMPLICIT_DEF
-    $r5 = IMPLICIT_DEF
-    $r6 = IMPLICIT_DEF
-    $r7 = IMPLICIT_DEF
-
-    %0 : tgpr = IMPLICIT_DEF
-    %0 = tADDhirr %0, $sp, 14, $noreg
-    tSTRi $r0, %0, 0, 14, $noreg
-
-    %1 : tgpr = IMPLICIT_DEF
-    %1 = tADDhirr %1, $sp, 14, $noreg
-    tSTRi $r1, %1, 0, 14, $noreg
-
-    %2 : tgpr = IMPLICIT_DEF
-    %2 = tADDhirr %2, $sp, 14, $noreg
-    tSTRi $r2, %2, 0, 14, $noreg
-
-    %3 : tgpr = IMPLICIT_DEF
-    %3 = tADDhirr %3, $sp, 14, $noreg
-    tSTRi $r3, %3, 0, 14, $noreg
-
-    %4 : tgpr = IMPLICIT_DEF
-    %4 = tADDhirr %4, $sp, 14, $noreg
-    tSTRi $r4, %4, 0, 14, $noreg
-
-    %5 : tgpr = IMPLICIT_DEF
-    %5 = tADDhirr %5, $sp, 14, $noreg
-    tSTRi $r5, %5, 0, 14, $noreg
-
-    %6 : tgpr = IMPLICIT_DEF
-    %6 = tADDhirr %6, $sp, 14, $noreg
-    tSTRi $r6, %6, 0, 14, $noreg
-
-    %7 : tgpr = IMPLICIT_DEF
-    %7 = tADDhirr %7, $sp, 14, $noreg
-    tSTRi $r7, %7, 0, 14, $noreg
-
-    KILL $r0
-    KILL $r1
-    KILL $r2
-    KILL $r3
-    KILL $r4
-    KILL $r5
-    KILL $r6
-    KILL $r7
diff --git a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
--- a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
@@ -34,9 +34,10 @@
 bb3:
   %.0 = phi i8* [ %0, %entry ], [ %6, %bb2 ], [ %3, %bb1 ]
 
-; CHECK: subs r4, #5
+; CHECK: subs r4, r7, #7
+; CHECK-NEXT: subs r4, #1
 ; CHECK-NEXT: mov sp, r4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r6, r7, pc}
   ret i8* %.0
 }
 
diff --git a/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
@@ -0,0 +1,380 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+target triple = "thumbv6m-unknown-unknown-eabi"
+
+define void @vla_emergency_spill(i32 %n) {
+; CHECK-LABEL: vla_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #4100
+; CHECK-NEXT:    ldr r6, .LCPI0_0
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    mov r6, sp
+; CHECK-NEXT:    adds r0, r0, #7
+; CHECK-NEXT:    movs r1, #7
+; CHECK-NEXT:    bics r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    subs r0, r1, r0
+; CHECK-NEXT:    mov sp, r0
+; CHECK-NEXT:    adds r1, r6, #4
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [r6]
+; CHECK-NEXT:    ldr r0, .LCPI0_1
+; CHECK-NEXT:    str r5, [r0, r6]
+; CHECK-NEXT:    ldr r0, [r6]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 4294963196 @ 0xffffeffc
+; CHECK-NEXT:  .LCPI0_1:
+; CHECK-NEXT:    .long 1024 @ 0x400
+entry:
+  %x = alloca [1024 x i32], align 4
+  %vla = alloca i8, i32 %n, align 1
+  %asm1 = call { i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},0,1,2,3,4,5"(i8* %vla, [1024 x i32]* %x, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %x, i32 0, i32 255
+  store i32 %asmresult5, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5) #2
+  ret void
+}
+
+define void @simple_emergency_spill(i32 %n) {
+; CHECK-LABEL: simple_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #8196
+; CHECK-NEXT:    ldr r7, .LCPI1_0
+; CHECK-NEXT:    add sp, r7
+; CHECK-NEXT:    add r0, sp, #4
+; CHECK-NEXT:    ldr r1, .LCPI1_2
+; CHECK-NEXT:    add r1, sp
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    ldr r0, .LCPI1_3
+; CHECK-NEXT:    add r0, sp
+; CHECK-NEXT:    str r5, [r0]
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    ldr r7, .LCPI1_1
+; CHECK-NEXT:    add sp, r7
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 4294959100 @ 0xffffdffc
+; CHECK-NEXT:  .LCPI1_1:
+; CHECK-NEXT:    .long 8196 @ 0x2004
+; CHECK-NEXT:  .LCPI1_2:
+; CHECK-NEXT:    .long 4100 @ 0x1004
+; CHECK-NEXT:  .LCPI1_3:
+; CHECK-NEXT:    .long 5120 @ 0x1400
+entry:
+  %x = alloca [1024 x i32], align 4
+  %y = alloca [1024 x i32], align 4
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"([1024 x i32]* %y, [1024 x i32]* %x, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 7
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %x, i32 0, i32 255
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8)
+  ret void
+}
+
+; We have some logic to try to spill registers instead of allocating an
+; emergency spill slot, but for targets where the stack alignment is 8,
+; it only triggers when there are two available registers. (This is
+; maybe worth looking into, to improve the generated code quality.)
+;
+; The scavenger itself only cares whether a register is allocatable, not
+; whether it was actually spilled in the prologue, and r7 is first on
+; the priority list, so we use it anyway. This is likely to confuse
+; debuggers, so maybe worth changing at some point.
+define void @simple_emergency_spill_nor7(i32 %n) {
+; CHECK-LABEL: simple_emergency_spill_nor7:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .pad #8196
+; CHECK-NEXT:    ldr r6, .LCPI2_0
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    add r0, sp, #4
+; CHECK-NEXT:    ldr r1, .LCPI2_2
+; CHECK-NEXT:    add r1, sp
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r7, [sp]
+; CHECK-NEXT:    ldr r7, .LCPI2_3
+; CHECK-NEXT:    add r7, sp
+; CHECK-NEXT:    str r5, [r7]
+; CHECK-NEXT:    ldr r7, [sp]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    ldr r6, .LCPI2_1
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI2_0:
+; CHECK-NEXT:    .long 4294959100 @ 0xffffdffc
+; CHECK-NEXT:  .LCPI2_1:
+; CHECK-NEXT:    .long 8196 @ 0x2004
+; CHECK-NEXT:  .LCPI2_2:
+; CHECK-NEXT:    .long 4100 @ 0x1004
+; CHECK-NEXT:  .LCPI2_3:
+; CHECK-NEXT:    .long 5120 @ 0x1400
+entry:
+  %x = alloca [1024 x i32], align 4
+  %y = alloca [1024 x i32], align 4
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([1024 x i32]* %y, [1024 x i32]* %x, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %x, i32 0, i32 255
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
+
+define void @arg_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [252 x i32]* byval %p) {
+; CHECK-LABEL: arg_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    add r0, sp, #24
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    ldr r0, .LCPI3_0
+; CHECK-NEXT:    add r0, sp
+; CHECK-NEXT:    str r5, [r0]
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI3_0:
+; CHECK-NEXT:    .long 1028 @ 0x404
+entry:
+  %pp = getelementptr inbounds [252 x i32], [252 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 7
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 251
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8)
+  ret void
+}
+
+; We currently overestimate the amount of required stack space by 16 bytes,
+; so this is the largest stack that doesn't require an emergency spill slot.
+define void @arg_no_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [248 x i32]* byval %p) {
+; CHECK-LABEL: arg_no_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    add r0, sp, #20
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r5, [sp, #1008]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %pp = getelementptr inbounds [248 x i32], [248 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 7
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 247
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8)
+  ret void
+}
+
+define void @aligned_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [31 x i32]* byval %p) {
+; CHECK-LABEL: aligned_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #44
+; CHECK-NEXT:    sub sp, #44
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    lsrs r4, r4, #4
+; CHECK-NEXT:    lsls r4, r4, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    adds r1, r7, #7
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp, #12]
+; CHECK-NEXT:    ldr r0, .LCPI5_0
+; CHECK-NEXT:    str r5, [r0, r7]
+; CHECK-NEXT:    ldr r0, [sp, #12]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI5_0:
+; CHECK-NEXT:    .long 128 @ 0x80
+entry:
+  %y = alloca [4 x i32], align 16
+  %pp = getelementptr inbounds [31 x i32], [31 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([4 x i32]* %y, i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) #3
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 30
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
+
+; This function should have no emergency spill slot, so its stack should be
+; smaller than @aligned_emergency_spill.
+define void @aligned_no_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [30 x i32]* byval %p) {
+; CHECK-LABEL: aligned_no_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #28
+; CHECK-NEXT:    sub sp, #28
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    lsrs r4, r4, #4
+; CHECK-NEXT:    lsls r4, r4, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    adds r1, r7, #7
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r5, [r7, #124]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %y = alloca [4 x i32], align 16
+  %pp = getelementptr inbounds [30 x i32], [30 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([4 x i32]* %y, i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) #3
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 29
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
+
+; This function shouldn't fail to compile. (It's UB, so it doesn't really
+; matter what it compiles to, exactly, but we need to check at some point
+; so we don't generate code that requires an emergency spill slot we never
+; allocated. If the store gets eliminated, this testcase probably needs
+; to be rewritten.)
+define void @aligned_out_of_range_access(i32 %n, i32 %n2, i32 %n3, i32 %n4, [30 x i32]* byval %p) {
+; CHECK-LABEL: aligned_out_of_range_access:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #44
+; CHECK-NEXT:    sub sp, #44
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    lsrs r4, r4, #4
+; CHECK-NEXT:    lsls r4, r4, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    adds r1, r7, #7
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    str r5, [r0, #120]
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %y = alloca [4 x i32], align 16
+  %pp = getelementptr inbounds [30 x i32], [30 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([4 x i32]* %y, i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) #3
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 30
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll
--- a/llvm/test/CodeGen/Thumb/frame-access.ll
+++ b/llvm/test/CodeGen/Thumb/frame-access.ll
@@ -124,7 +124,7 @@
 ; CHECK-NEXT: lsls r4, r4, #4
 ; CHECK-NEXT: mov sp, r4
 ; Incoming register varargs stored via FP
-; CHECK: mov r0, r7
+; CHECK: mov r0, r7
 ; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: stm r0!, {r1, r2, r3}
 ; VLAs present, access via FP
@@ -199,11 +199,13 @@
 ; CHECK: push {r4, r5, r6, r7, lr}
 ; 20 bytes locals
 ; CHECK: sub sp, #20
+; Setup base pointer
+; CHECK: mov r6, sp
 ; Allocate outgoing arguments space
 ; CHECK: sub sp, #508
 ; CHECK: sub sp, #4
-; Load `e` via SP, 552 = 512 + 20 + 20
-; CHECK: ldr r3, [sp, #552]
+; Load `e` via BP, 40 = 20 + 20
+; CHECK: ldr r3, [r6, #40]
 ; CHECK: bl f
 ; Stack restored before next call
 ; CHECK-NEXT: add sp, #508
@@ -235,11 +237,12 @@
 ; Three incoming register varargs
 ; CHECK: sub sp, #12
 ; 16 bytes callee-saves
-; CHECK: push {r4, r5, r7, lr}
+; CHECK: push {r4, r5, r6, lr}
 ; 20 bytes locals
 ; CHECK: sub sp, #20
-; Incoming varargs stored via SP, 36 = 20 + 16
-; CHECK: add r0, sp, #36
+; Incoming varargs stored via BP, 36 = 20 + 16
+; CHECK: mov r0, r6
+; CHECK-NEXT: adds r0, #36
 ; CHECK-NEXT: stm r0!, {r1, r2, r3}
 ;
@@ -394,17 +397,19 @@
 ; CHECK-LABEL: test_local_moving_sp
 ; Locals area
 ; CHECK: sub sp, #36
+; Setup BP
+; CHECK: mov r6, sp
 ; Outoging arguments
 ; CHECK: sub sp, #508
 ; CHECK-NEXT: sub sp, #508
 ; CHECK-NEXT: sub sp, #8
-; Argument addresses computed relative to SP
-; CHECK: add r4, sp, #1020
-; CHECK-NEXT: adds r4, #24
-; CHECK: add r1, sp, #1020
-; CHECK-NEXT: adds r1, #20
-; CHECK: add r5, sp, #1020
-; CHECK-NEXT: adds r5, #16
+; Argument addresses computed relative to BP
+; CHECK: adds r0, r6, #7
+; CHECK-NEXT: adds r0, #13
+; CHECK: adds r1, r6, #7
+; CHECK-NEXT: adds r1, #9
+; CHECK: adds r5, r6, #7
+; CHECK-NEXT: adds r5, #5
 ; CHECK: bl u
 ; Stack restored before next call
 ; CHECK: add sp, #508
diff --git a/llvm/test/CodeGen/Thumb/large-stack.ll b/llvm/test/CodeGen/Thumb/large-stack.ll
--- a/llvm/test/CodeGen/Thumb/large-stack.ll
+++ b/llvm/test/CodeGen/Thumb/large-stack.ll
@@ -33,9 +33,8 @@
 ; CHECK: sub sp, #508
 ; CHECK: sub sp, #508
 ; CHECK: sub sp, #508
-; ALIGN4: subs r4, r7, #4
-; ALIGN8: subs r4, r7, #7
-; ALIGN8: subs r4, #1
+; CHECK: subs r4, r7, #7
+; CHECK: subs r4, #1
 ; CHECK: mov sp, r4
     %tmp = alloca [ 1524 x i8 ] , align 4
     ret void
@@ -57,9 +56,8 @@
 ; CHECK-LABEL: test2_nofpelim:
 ; CHECK: ldr [[TEMP:r[0-7]]],
 ; CHECK: add sp, [[TEMP]]
-; ALIGN4: subs r4, r7, #4
-; ALIGN8: subs r4, r7, #7
-; ALIGN8: subs r4, #1
+; CHECK: subs r4, r7, #7
+; CHECK: subs r4, #1
 ; CHECK: mov sp, r4
     %tmp = alloca [ 1528 x i8 ] , align 4
     ret void
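
Reviewer note on the offset limits used in determineCalleeSaves() above -- a
minimal standalone sketch, not part of the patch, with hypothetical helper
names: tSTRspi encodes an 8-bit unsigned immediate scaled by 4, and tSTRi a
5-bit unsigned immediate scaled by 4, which is where the (1U << 8) * 4 and
(1U << 5) * 4 estimates come from.

  // Illustrative only; not LLVM API.
  // tSTRspi: str Rt, [sp, #imm8 * 4] -> byte offsets 0..1020 reachable.
  static bool isLegalThumb1SPRelStoreOffset(int Offset) {
    return Offset >= 0 && Offset < (1 << 8) * 4 && (Offset % 4) == 0;
  }
  // tSTRi: str Rt, [Rn, #imm5 * 4] -> byte offsets 0..124 reachable, so a
  // slot 128 bytes or more above the base/frame pointer needs a scavenged
  // register (and hence, potentially, the emergency spill slot).
  static bool isLegalThumb1BPRelStoreOffset(int Offset) {
    return Offset >= 0 && Offset < (1 << 5) * 4 && (Offset % 4) == 0;
  }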