Index: clang/docs/ClangCommandLineReference.rst
===================================================================
--- clang/docs/ClangCommandLineReference.rst
+++ clang/docs/ClangCommandLineReference.rst
@@ -2430,6 +2430,13 @@
 ARM
 ---
+.. option:: -ffixed-r4
+
+Reserve the r4 register (ARM only)
+
+.. option:: -ffixed-r5
+
+Reserve the r5 register (ARM only)
 
 .. option:: -ffixed-r6
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -2263,7 +2263,7 @@
 def mno_restrict_it: Flag<["-"], "mno-restrict-it">, Group<m_arm_Features_Group>,
   HelpText<"Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode">;
 def marm : Flag<["-"], "marm">, Alias<mno_thumb>;
-foreach i = {6-11} in
+foreach i = {4-11} in
   def ffixed_r#i : Flag<["-"], "ffixed-r"#i>, Group<m_arm_Features_Group>,
     HelpText<"Reserve the r"#i#" register (ARM only)">;
 def mno_movt : Flag<["-"], "mno-movt">, Group<m_arm_Features_Group>,
Index: clang/lib/Driver/ToolChains/Arch/ARM.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -619,6 +619,8 @@
 #define HANDLE_FFIXED_R(n)                                                     \
   if (Args.hasArg(options::OPT_ffixed_r##n))                                   \
   Features.push_back("+reserve-r" #n)
+  HANDLE_FFIXED_R(4);
+  HANDLE_FFIXED_R(5);
   HANDLE_FFIXED_R(6);
   HANDLE_FFIXED_R(7);
   HANDLE_FFIXED_R(8);
Index: llvm/lib/Target/ARM/ARM.td
===================================================================
--- llvm/lib/Target/ARM/ARM.td
+++ llvm/lib/Target/ARM/ARM.td
@@ -391,7 +391,7 @@
                                          "Enable the generation of "
                                          "execute only code.">;
 
-foreach i = {6-11} in
+foreach i = {4-11} in
   def FeatureReserveR#i : SubtargetFeature<"reserve-r"#i,
                                            "ReservedGPRegisters["#i#"]", "true",
                                            "Reserve R"#i#", making it "
Index: llvm/lib/Target/ARM/ARMFrameLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMFrameLowering.h
+++ llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
 #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include <vector>
@@ -71,6 +72,11 @@
     return false;
   }
 
+  // Return a non-reserved general purpose register that can be used as a
+  // scratch register.
+  unsigned getScratchRegister(const MachineFunction &MF,
+                              BitVector inUse = BitVector()) const;
+
 private:
   void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                     const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
Index: llvm/lib/Target/ARM/ARMFrameLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -73,7 +73,7 @@
 
 static MachineBasicBlock::iterator
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
-                        unsigned NumAlignedDPRCS2Regs);
+                        unsigned NumAlignedDPRCS2Regs, unsigned ScratchReg);
 
 ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
     : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)),
@@ -133,6 +133,25 @@
   return !MFI.hasVarSizedObjects();
 }
 
+// Decide which register out of r4-r11 can be used as a scratch register,
+// based on whether it is free according to RegisterInfo and not excluded by
+// InUse.
+unsigned ARMFrameLowering::getScratchRegister(const MachineFunction &MF,
+                                              BitVector InUse) const {
+  bool HasFP = hasFP(MF);
+  Register FramePtr =
+      STI.getInstrInfo()->getRegisterInfo().getFrameRegister(MF);
+  unsigned R4 = ARM::R4 - ARM::GPRRegClass.begin()[0];
+  for (unsigned i = R4; i < ARM::GPRRegClass.getNumRegs() - R4; i++) {
+    unsigned Reg = ARM::GPRRegClass.getRegister(i);
+    if (!STI.isGPRegisterReserved(i) && !(i < InUse.size() && InUse[i]) &&
+        !(HasFP && Reg == FramePtr))
+      return Reg;
+  }
+  report_fatal_error("ran out of registers: Too many registers reserved");
+  return ARM::NoRegister;
+}
+
 /// canSimplifyCallFramePseudos - If there is a reserved call frame, the
 /// call frame pseudos can be simplified. Unlike most targets, having a FP
 /// is not sufficient here since we still may reference some objects via SP
@@ -506,7 +524,8 @@
 
   // Move past the aligned DPRCS2 area.
   if (AFI->getNumAlignedDPRCS2Regs() > 0) {
-    MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
+    MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs(),
+                                   getScratchRegister(MF));
     // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
     // leaves the stack pointer pointing to the DPRCS2 area.
     //
@@ -516,6 +535,22 @@
     NumBytes = DPRCSOffset;
 
   if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) {
+    // r4 might be used as a global named register, which means its value has
+    // not been and must not be saved to the stack. Thus temporarily save its
+    // content in another register so r4 is free to be used for __chkstk.
+    unsigned BackupR4 = ARM::NoRegister;
+    if (STI.isGPRegisterReserved(ARM::R4 - ARM::GPRRegClass.begin()[0])) {
+      if (!STI.hasV6Ops())
+        // Do not allow r4 to be reserved if stack probes are needed on a
+        // target where 'mov low_reg, low_reg' is not possible.
+        report_fatal_error("-ffixed-r4 is not allowed for this target when "
+                           "stack probes are in use.");
+      BackupR4 = getScratchRegister(MF);
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BackupR4)
+          .addReg(ARM::R4)
+          .add(predOps(ARMCC::AL));
+    }
+
     uint32_t NumWords = NumBytes >> 2;
 
     if (NumWords < 65536)
@@ -560,6 +595,13 @@
           .add(predOps(ARMCC::AL))
           .add(condCodeOp());
     NumBytes = 0;
+
+    // If r4 is actually a fixed register, restore it.
+    if (BackupR4) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+          .addReg(BackupR4)
+          .add(predOps(ARMCC::AL));
+    }
   }
 
   if (NumBytes) {
@@ -723,20 +765,21 @@
       emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
                                false);
     } else {
-      // We cannot use sp as source/dest register here, thus we're using r4 to
-      // perform the calculations. We're emitting the following sequence:
+      // We cannot use sp as source/dest register here, thus we're using a
+      // scratch register to perform the calculations. We're emitting the
+      // following sequence (e.g. ScratchReg = r4):
       // mov r4, sp
       // -- use emitAligningInstructions to produce best sequence to zero
       // -- out lower bits in r4
       // mov sp, r4
-      // FIXME: It will be better just to find spare register here.
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+      unsigned ScratchReg = getScratchRegister(MF);
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ScratchReg)
          .addReg(ARM::SP, RegState::Kill)
          .add(predOps(ARMCC::AL));
-      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
-                               false);
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ScratchReg,
+                               MaxAlign, false);
       BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-        .addReg(ARM::R4, RegState::Kill)
+        .addReg(ScratchReg, RegState::Kill)
         .add(predOps(ARMCC::AL));
     }
 
@@ -827,13 +870,14 @@
       // sub sp, #24
       // This is bad, if an interrupt is taken after the mov, sp is in an
       // inconsistent state.
-      // Use the first callee-saved register as a scratch register.
-      assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
+      // Use a free scratch register to restore SP from FP.
+      unsigned ScratchReg = getScratchRegister(MF);
+      assert(!MFI.getPristineRegs(MF).test(ScratchReg) &&
              "No scratch register to restore SP from FP!");
-      emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+      emitT2RegPlusImmediate(MBB, MBBI, dl, ScratchReg, FramePtr, -NumBytes,
                              ARMCC::AL, 0, TII);
       BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-          .addReg(ARM::R4)
+          .addReg(ScratchReg)
          .add(predOps(ARMCC::AL));
     }
   } else {
@@ -1163,7 +1207,8 @@
                                     MachineBasicBlock::iterator MI,
                                     unsigned NumAlignedDPRCS2Regs,
                                     const std::vector<CalleeSavedInfo> &CSI,
-                                    const TargetRegisterInfo *TRI) {
+                                    const TargetRegisterInfo *TRI,
+                                    unsigned ScratchReg) {
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
@@ -1192,8 +1237,8 @@
     MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
   }
 
-  // Move the stack pointer to the d8 spill slot, and align it at the same
-  // time. Leave the stack slot address in the scratch register r4.
+  // Move the stack pointer to the d8 spill slot, and align it at the same
+  // time. Leave the stack slot address in the scratch register (e.g. r4).
   //
   // sub r4, sp, #numregs * 8
   // bic r4, r4, #align - 1
   // mov sp, r4
@@ -1206,7 +1251,7 @@
   // sub r4, sp, #numregs * 8
   // The immediate is <= 64, so it doesn't need any special encoding.
   unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
-  BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+  BuildMI(MBB, MI, DL, TII.get(Opc), ScratchReg)
      .addReg(ARM::SP)
      .addImm(8 * NumAlignedDPRCS2Regs)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());
@@ -1218,21 +1263,22 @@
   // stack alignment. Luckily, this can always be done since all ARM
   // architecture versions that support Neon also support the BFC
   // instruction.
-  emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);
+  emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ScratchReg, MaxAlign,
+                           true);
 
   // mov sp, r4
   // The stack pointer must be adjusted before spilling anything, otherwise
   // the stack slots could be clobbered by an interrupt handler.
-  // Leave r4 live, it is used below.
+  // Leave the scratch register live, it is used below.
   Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
   MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
-                                .addReg(ARM::R4)
+                                .addReg(ScratchReg)
                                 .add(predOps(ARMCC::AL));
   if (!isThumb)
     MIB.add(condCodeOp());
 
   // Now spill NumAlignedDPRCS2Regs registers starting from d8.
-  // r4 holds the stack slot address.
+  // The scratch register holds the stack slot address.
   unsigned NextReg = ARM::D8;
 
   // 16-byte aligned vst1.64 with 4 d-regs and address writeback.
@@ -1241,8 +1287,8 @@
     unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                                &ARM::QQPRRegClass);
     MBB.addLiveIn(SupReg);
-    BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4)
-        .addReg(ARM::R4, RegState::Kill)
+    BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
        .addImm(16)
        .addReg(NextReg)
        .addReg(SupReg, RegState::ImplicitKill)
@@ -1251,8 +1297,8 @@
     NumAlignedDPRCS2Regs -= 4;
   }
 
-  // We won't modify r4 beyond this point. It currently points to the next
-  // register to be spilled.
+  // We won't modify the scratch register beyond this point. It currently
+  // points to the next register to be spilled.
   unsigned R4BaseReg = NextReg;
 
   // 16-byte aligned vst1.64 with 4 d-regs, no writeback.
@@ -1261,7 +1307,7 @@
                                                &ARM::QQPRRegClass);
     MBB.addLiveIn(SupReg);
     BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
-        .addReg(ARM::R4)
+        .addReg(ScratchReg)
        .addImm(16)
        .addReg(NextReg)
        .addReg(SupReg, RegState::ImplicitKill)
@@ -1276,7 +1322,7 @@
                                                &ARM::QPRRegClass);
     MBB.addLiveIn(SupReg);
     BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
-        .addReg(ARM::R4)
+        .addReg(ScratchReg)
        .addImm(16)
        .addReg(SupReg)
        .add(predOps(ARMCC::AL));
@@ -1290,23 +1336,24 @@
     // vstr.64 uses addrmode5 which has an offset scale of 4.
     BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
         .addReg(NextReg)
-        .addReg(ARM::R4)
+        .addReg(ScratchReg)
        .addImm((NextReg - R4BaseReg) * 2)
        .add(predOps(ARMCC::AL));
   }
 
-  // The last spill instruction inserted should kill the scratch register r4.
-  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+  // The last spill instruction inserted should kill the scratch register.
+  std::prev(MI)->addRegisterKilled(ScratchReg, TRI);
 }
 
 /// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
 /// iterator to the following instruction.
 static MachineBasicBlock::iterator
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
-                        unsigned NumAlignedDPRCS2Regs) {
+                        unsigned NumAlignedDPRCS2Regs, unsigned ScratchReg) {
   // sub r4, sp, #numregs * 8
   // bic r4, r4, #align - 1
   // mov sp, r4
+  // (e.g. ScratchReg = r4)
   ++MI; ++MI; ++MI;
   assert(MI->mayStore() && "Expecting spill instruction");
 
@@ -1323,7 +1370,7 @@
   case 1:
   case 2:
   case 4:
-    assert(MI->killsRegister(ARM::R4) && "Missed kill flag");
+    assert(MI->killsRegister(ScratchReg) && "Missed kill flag");
     ++MI;
   }
   return MI;
@@ -1336,7 +1383,8 @@
                                       MachineBasicBlock::iterator MI,
                                       unsigned NumAlignedDPRCS2Regs,
                                       const std::vector<CalleeSavedInfo> &CSI,
-                                      const TargetRegisterInfo *TRI) {
+                                      const TargetRegisterInfo *TRI,
+                                      unsigned ScratchReg) {
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
@@ -1350,7 +1398,7 @@
       break;
   }
 
-  // Materialize the address of the d8 spill slot into the scratch register r4.
+  // Materialize the address of the d8 spill slot into the scratch register.
   // This can be fairly complicated if the stack frame is large, so just use
   // the normal frame index elimination mechanism to do it. This code runs as
   // the initial part of the epilog where the stack and base pointers haven't
@@ -1359,7 +1407,7 @@
   assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
 
   unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
-  BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+  BuildMI(MBB, MI, DL, TII.get(Opc), ScratchReg)
      .addFrameIndex(D8SpillFI)
      .addImm(0)
      .add(predOps(ARMCC::AL))
@@ -1373,8 +1421,8 @@
     unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                                &ARM::QQPRRegClass);
     BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
-        .addReg(ARM::R4, RegState::Define)
-        .addReg(ARM::R4, RegState::Kill)
+        .addReg(ScratchReg, RegState::Define)
+        .addReg(ScratchReg, RegState::Kill)
        .addImm(16)
        .addReg(SupReg, RegState::ImplicitDefine)
        .add(predOps(ARMCC::AL));
     NextReg += 4;
@@ -1382,8 +1430,8 @@
     NumAlignedDPRCS2Regs -= 4;
   }
 
-  // We won't modify r4 beyond this point. It currently points to the next
-  // register to be spilled.
+  // We won't modify the scratch register beyond this point. It currently
+  // points to the next register to be reloaded.
   unsigned R4BaseReg = NextReg;
 
   // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
@@ -1391,7 +1439,7 @@
     unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                                &ARM::QQPRRegClass);
     BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
-        .addReg(ARM::R4)
+        .addReg(ScratchReg)
        .addImm(16)
        .addReg(SupReg, RegState::ImplicitDefine)
        .add(predOps(ARMCC::AL));
@@ -1404,7 +1452,7 @@
     unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
                                                &ARM::QPRRegClass);
     BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
-        .addReg(ARM::R4)
+        .addReg(ScratchReg)
        .addImm(16)
        .add(predOps(ARMCC::AL));
     NextReg += 2;
@@ -1414,12 +1462,12 @@
   // Finally, use a vanilla vldr.64 for the remaining odd register.
   if (NumAlignedDPRCS2Regs)
     BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
-        .addReg(ARM::R4)
+        .addReg(ScratchReg)
        .addImm(2 * (NextReg - R4BaseReg))
        .add(predOps(ARMCC::AL));
 
-  // Last store kills r4.
-  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+  // The last reload kills the scratch register.
+  std::prev(MI)->addRegisterKilled(ScratchReg, TRI);
 }
 
 bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
@@ -1448,7 +1496,8 @@
   // The stack realignment code will be inserted between the push instructions
   // and these spills.
   if (NumAlignedDPRCS2Regs)
-    emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+    emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI,
+                            getScratchRegister(MF));
 
   return true;
 }
@@ -1468,7 +1517,8 @@
   // The emitPopInst calls below do not insert reloads for the aligned DPRCS2
   // registers. Do that here instead.
   if (NumAlignedDPRCS2Regs)
-    emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+    emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI,
+                              getScratchRegister(MF));
 
   unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
   unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM;
@@ -1586,8 +1636,8 @@
 // In functions that realign the stack, it can be an advantage to spill the
 // callee-saved vector registers after realigning the stack. The vst1 and vld1
 // instructions take alignment hints that can improve performance.
-static void
-checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
+static void checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs,
+                                      unsigned ScratchReg) {
   MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
   if (!SpillAlignedNEONRegs)
     return;
@@ -1627,7 +1677,7 @@
   MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
 
   // A scratch register is required for the vst1 / vld1 instructions.
-  SavedRegs.set(ARM::R4);
+  SavedRegs.set(ScratchReg);
 }
 
 void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
@@ -1656,23 +1706,23 @@
   (void)TRI;  // Silence unused warning in non-assert builds.
   Register FramePtr = RegInfo->getFrameRegister(MF);
 
-  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
-  // scratch register. Also spill R4 if Thumb2 function has varsized objects,
-  // since it's not always possible to restore sp from fp in a single
-  // instruction.
-  // FIXME: It will be better just to find spare register here.
+  unsigned ScratchReg = getScratchRegister(MF);
+
+  // Spill the scratch register if a Thumb2 function requires stack
+  // realignment. Also spill it if a Thumb2 function has varsized objects,
+  // since it's not always possible to restore sp from fp in a single
+  // instruction.
   if (AFI->isThumb2Function() &&
       (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
-    SavedRegs.set(ARM::R4);
+    SavedRegs.set(ScratchReg);
 
-  // If a stack probe will be emitted, spill R4 and LR, since they are
-  // clobbered by the stack probe call.
+  // If a stack probe will be emitted, spill the scratch register and LR,
+  // since they are clobbered by the stack probe call.
   // This estimate should be a safe, conservative estimate. The actual
   // stack probe is enabled based on the size of the local objects;
   // this estimate also includes the varargs store size.
   if (STI.isTargetWindows() &&
       WindowsRequiresStackProbe(MF, MFI.estimateStackSize(MF))) {
-    SavedRegs.set(ARM::R4);
+    SavedRegs.set(ScratchReg);
     SavedRegs.set(ARM::LR);
   }
 
@@ -1681,19 +1731,19 @@
     if (AFI->getArgRegsSaveSize() > 0)
       SavedRegs.set(ARM::LR);
 
-    // Spill R4 if Thumb1 epilogue has to restore SP from FP or the function
-    // requires stack alignment. We don't know for sure what the stack size
-    // will be, but for this, an estimate is good enough. If there anything
-    // changes it, it'll be a spill, which implies we've used all the registers
-    // and so R4 is already used, so not marking it here will be OK.
-    // FIXME: It will be better just to find spare register here.
+    // Spill the scratch register if a Thumb1 epilogue has to restore SP from
+    // FP or the function requires stack alignment. We don't know for sure
+    // what the stack size will be, but an estimate is good enough here. If
+    // anything changes it, it'll be a spill, which implies we've used all
+    // the registers and so the scratch register is already used, so not
+    // marking it here will be OK.
     if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF) ||
         MFI.estimateStackSize(MF) > 508)
-      SavedRegs.set(ARM::R4);
+      SavedRegs.set(ScratchReg);
   }
 
   // See if we can spill vector registers to aligned stack.
-  checkNumAlignedDPRCS2Regs(MF, SavedRegs);
+  checkNumAlignedDPRCS2Regs(MF, SavedRegs, ScratchReg);
 
   // Spill the BasePtr if it's used.
   if (RegInfo->hasBasePointer(MF))
@@ -2292,10 +2342,39 @@
     return;
   }
 
-  // Use R4 and R5 as scratch registers.
-  // We save R4 and R5 before use and restore them before leaving the function.
-  unsigned ScratchReg0 = ARM::R4;
-  unsigned ScratchReg1 = ARM::R5;
+  // We will use r4 to hold the stack limit and r5 to hold the stack size
+  // requested and arguments for __morestack().
+
+  // r4 and r5 might be used as global named registers. If that is the case,
+  // temporarily save their content in other registers so they're free to be
+  // used for __morestack.
+  unsigned BackupR4 = ARM::NoRegister;
+  unsigned BackupR5 = ARM::NoRegister;
+  BitVector UsedRegs = BitVector(ARM::GPRRegClass.getNumRegs());
+  // Make sure we won't get r4/r5 suggested as a backup register.
+  unsigned R4 = ARM::R4 - ARM::GPRRegClass.begin()[0];
+  unsigned R5 = ARM::R5 - ARM::GPRRegClass.begin()[0];
+  UsedRegs[R4] = true;
+  UsedRegs[R5] = true;
+
+  if (STI.isGPRegisterReserved(R4)) {
+    if (!STI.hasV6Ops())
+      // Do not allow r4 to be reserved if segmented stacks are used on a
+      // target where 'mov low_reg, low_reg' is not possible.
+      report_fatal_error("-ffixed-r4 is not allowed for this target when "
+                         "segmented stacks are in use.");
+    BackupR4 = getScratchRegister(MF, UsedRegs);
+    UsedRegs[BackupR4 - ARM::GPRRegClass.begin()[0]] = true;
+  }
+  if (STI.isGPRegisterReserved(R5)) {
+    if (!STI.hasV6Ops())
+      // Do not allow r5 to be reserved if segmented stacks are used on a
+      // target where 'mov low_reg, low_reg' is not possible.
+      report_fatal_error("-ffixed-r5 is not allowed for this target when "
+                         "segmented stacks are in use.");
+    BackupR5 = getScratchRegister(MF, UsedRegs);
+  }
+
   uint64_t AlignedStackSize;
 
   MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
@@ -2354,27 +2433,37 @@
   // boundary directly to the value of the stack pointer, per gcc.
   bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;
 
-  // We will use two of the callee save registers as scratch registers so we
-  // need to save those registers onto the stack.
-  // We will use SR0 to hold stack limit and SR1 to hold the stack size
-  // requested and arguments for __morestack().
-  // SR0: Scratch Register #0
-  // SR1: Scratch Register #1
-  // push {SR0, SR1}
+  // We will use r4 and r5 either directly or use scratch registers to save
+  // their values, so we need to save those registers onto the stack.
+  // push {r4/sr4, r5/sr5}
+  unsigned SortedPushList[2] = {
+      std::min(BackupR4 ? BackupR4 : (unsigned)ARM::R4,
+               BackupR5 ? BackupR5 : (unsigned)ARM::R5),
+      std::max(BackupR5 ? BackupR5 : (unsigned)ARM::R5,
+               BackupR4 ? BackupR4 : (unsigned)ARM::R4)};
   if (Thumb) {
     BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH))
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0)
-        .addReg(ScratchReg1);
+        .addReg(SortedPushList[0])
+        .addReg(SortedPushList[1]);
   } else {
     BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0)
-        .addReg(ScratchReg1);
+        .addReg(SortedPushList[0])
+        .addReg(SortedPushList[1]);
   }
 
+  if (BackupR4)
+    BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), BackupR4)
+        .addReg(ARM::R4)
+        .add(predOps(ARMCC::AL));
+  if (BackupR5)
+    BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), BackupR5)
+        .addReg(ARM::R5)
+        .add(predOps(ARMCC::AL));
+
   // Emit the relevant DWARF information about the change in stack pointer as
   // well as where to find both r4 and r5 (the callee-save registers)
   CFIIndex =
@@ -2382,21 +2471,21 @@
   BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
   CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
-      nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4));
+      nullptr, MRI->getDwarfRegNum(ARM::R5, true), -4));
   BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
   CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
-      nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8));
+      nullptr, MRI->getDwarfRegNum(ARM::R4, true), -8));
   BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
 
   // mov SR1, sp
   if (Thumb) {
-    BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+    BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ARM::R5)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL));
   } else if (CompareStackPointer) {
-    BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+    BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ARM::R5)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
@@ -2404,13 +2493,13 @@
 
   // sub SR1, sp, #StackSize
   if (!CompareStackPointer && Thumb) {
-    BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1)
+    BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ARM::R5)
        .add(condCodeOp())
-        .addReg(ScratchReg1)
+        .addReg(ARM::R5)
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL));
   } else if (!CompareStackPointer) {
-    BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+    BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ARM::R5)
        .addReg(ARM::SP)
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL))
@@ -2424,20 +2513,20 @@
     MachineConstantPool *MCP = MF.getConstantPool();
     unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
 
-    // ldr SR0, [pc, offset(STACK_LIMIT)]
-    BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+    // ldr r4, [pc, offset(STACK_LIMIT)]
+    BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ARM::R4)
        .addConstantPoolIndex(CPI)
        .add(predOps(ARMCC::AL));
 
-    // ldr SR0, [SR0]
-    BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
-        .addReg(ScratchReg0)
+    // ldr r4, [r4]
+    BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ARM::R4)
+        .addReg(ARM::R4)
        .addImm(0)
        .add(predOps(ARMCC::AL));
   } else {
     // Get TLS base address from the coprocessor
-    // mrc p15, #0, SR0, c13, c0, #3
-    BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+    // mrc p15, #0, r4, c13, c0, #3
+    BuildMI(McrMBB, DL, TII.get(ARM::MRC), ARM::R4)
        .addImm(15)
        .addImm(0)
        .addImm(13)
@@ -2450,19 +2539,19 @@
     unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;
 
     // Get the stack limit from the right offset
-    // ldr SR0, [sr0, #4 * TlsOffset]
-    BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
-        .addReg(ScratchReg0)
+    // ldr r4, [r4, #4 * TlsOffset]
+    BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ARM::R4)
+        .addReg(ARM::R4)
        .addImm(4 * TlsOffset)
        .add(predOps(ARMCC::AL));
   }
 
   // Compare stack limit with stack size requested.
-  // cmp SR0, SR1
+  // cmp r4, r5
   Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
   BuildMI(GetMBB, DL, TII.get(Opcode))
-      .addReg(ScratchReg0)
-      .addReg(ScratchReg1)
+      .addReg(ARM::R4)
+      .addReg(ARM::R5)
      .add(predOps(ARMCC::AL));
 
   // This jump is taken if StackLimit < SP - stack required.
@@ -2471,33 +2560,32 @@
      .addImm(ARMCC::LO)
      .addReg(ARM::CPSR);
 
-  // Calling __morestack(StackSize, Size of stack arguments).
-  // __morestack knows that the stack size requested is in SR0(r4)
-  // and amount size of stack arguments is in SR1(r5).
+  // __morestack knows that the stack size requested is in r4 and the size
+  // of the stack arguments is in r5.
 
-  // Pass first argument for the __morestack by Scratch Register #0.
+  // Pass the first argument to __morestack in r4.
   // The amount size of stack required
   if (Thumb) {
-    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0)
+    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ARM::R4)
        .add(condCodeOp())
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL));
   } else {
-    BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+    BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ARM::R4)
        .addImm(AlignedStackSize)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
   }
 
-  // Pass second argument for the __morestack by Scratch Register #1.
-  // The amount size of stack consumed to save function arguments.
+  // Pass the second argument to __morestack in r5.
+  // The amount of stack consumed to save function arguments.
   if (Thumb) {
-    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1)
+    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ARM::R5)
        .add(condCodeOp())
        .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
        .add(predOps(ARMCC::AL));
   } else {
-    BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+    BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ARM::R5)
        .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
@@ -2542,9 +2630,9 @@
   if (ST->isThumb1Only()) {
     BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0);
+        .addReg(ARM::R4);
     BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
-        .addReg(ScratchReg0)
+        .addReg(ARM::R4)
        .add(predOps(ARMCC::AL));
   } else {
     BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
@@ -2562,22 +2650,35 @@
        .addReg(ARM::LR);
   }
 
-  // Restore SR0 and SR1 in case of __morestack() was called.
+  // If r4 and r5 were backed up in other registers, restore them from there.
+  // mov r4, sr4
+  // mov r5, sr5
+  if (BackupR4)
+    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::R4)
+        .addReg(BackupR4)
+        .add(predOps(ARMCC::AL));
+  if (BackupR5)
+    BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::R5)
+        .addReg(BackupR5)
+        .add(predOps(ARMCC::AL));
+
+  // Restore r4 and r5 (resp. their backup registers) in case __morestack()
+  // was called.
   // __morestack() will skip PostStackMBB block so we need to restore
   // scratch registers from here.
-  // pop {SR0, SR1}
+  // pop {r4/sr4, r5/sr5}
   if (Thumb) {
     BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0)
-        .addReg(ScratchReg1);
+        .addReg(SortedPushList[0])
+        .addReg(SortedPushList[1]);
   } else {
     BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0)
-        .addReg(ScratchReg1);
+        .addReg(SortedPushList[0])
+        .addReg(SortedPushList[1]);
   }
 
   // Update the CFA offset now that we've popped
@@ -2589,19 +2690,30 @@
   BuildMI(AllocMBB, DL, TII.get(ST->getReturnOpcode())).add(predOps(ARMCC::AL));
 
-  // Restore SR0 and SR1 in case of __morestack() was not called.
-  // pop {SR0, SR1}
+  // Restore r4 and r5 (resp. their backup registers) in case __morestack()
+  // was not called.
+  // mov r4, sr4
+  // mov r5, sr5
+  // pop {r4/sr4, r5/sr5}
+  if (BackupR4)
+    BuildMI(PostStackMBB, DL, TII.get(ARM::tMOVr), ARM::R4)
+        .addReg(BackupR4)
+        .add(predOps(ARMCC::AL));
+  if (BackupR5)
+    BuildMI(PostStackMBB, DL, TII.get(ARM::tMOVr), ARM::R5)
+        .addReg(BackupR5)
+        .add(predOps(ARMCC::AL));
   if (Thumb) {
     BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP))
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0)
-        .addReg(ScratchReg1);
+        .addReg(SortedPushList[0])
+        .addReg(SortedPushList[1]);
   } else {
     BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
        .addReg(ARM::SP, RegState::Define)
        .addReg(ARM::SP)
        .add(predOps(ARMCC::AL))
-        .addReg(ScratchReg0)
-        .addReg(ScratchReg1);
+        .addReg(SortedPushList[0])
+        .addReg(SortedPushList[1]);
   }
 
   // Update the CFA offset now that we've popped
@@ -2612,11 +2717,11 @@
   // Tell debuggers that r4 and r5 are now the same as they were in the
   // previous function, that they're the "Same Value".
   CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
-      nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
+      nullptr, MRI->getDwarfRegNum(ARM::R4, true)));
   BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
   CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
-      nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
+      nullptr, MRI->getDwarfRegNum(ARM::R5, true)));
   BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
 
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -628,6 +628,8 @@
 
     bool preferIncOfAddToSubOfNot(EVT VT) const override;
 
+    unsigned getScratchRegister(unsigned FramePointer) const;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5572,6 +5572,8 @@
 Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                               const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
+                     .Case("r4", ARM::R4)
+                     .Case("r5", ARM::R5)
                      .Case("r6", ARM::R6)
                      .Case("r7", ARM::R7)
                      .Case("r8", ARM::R8)
@@ -10217,6 +10219,17 @@
          "__chkstk is only supported on Windows");
   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
 
+  // r4 might be used as a global named register, which means its value has
+  // not been and must not be saved to the stack. Thus temporarily save its
+  // content in another register so r4 is free to be used for __chkstk.
+  unsigned BackupR4 = ARM::NoRegister;
+  if (Subtarget->isGPRegisterReserved(ARM::R4 - ARM::GPRRegClass.begin()[0])) {
+    BackupR4 = getScratchRegister(ARM::R11);
+    BuildMI(*MBB, MI, DL, TII.get(ARM::tMOVr), BackupR4)
+        .addReg(ARM::R4)
+        .add(predOps(ARMCC::AL));
+  }
+
   // __chkstk takes the number of words to allocate on the stack in R4, and
   // returns the stack adjustment in number of bytes in R4. This will not
   // clober any other registers (other than the obvious lr).
@@ -10277,6 +10291,13 @@
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());
 
+  // If r4 is actually a fixed register, restore it.
+  if (BackupR4) {
+    BuildMI(*MBB, MI, DL, TII.get(ARM::tMOVr), ARM::R4)
+        .addReg(BackupR4)
+        .add(predOps(ARMCC::AL));
+  }
+
   MI.eraseFromParent();
   return MBB;
 }
@@ -17183,3 +17205,14 @@
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+unsigned ARMTargetLowering::getScratchRegister(unsigned FramePointer) const {
+  for (unsigned i = 4; i < ARM::GPRRegClass.getNumRegs() - 4; i++) {
+    unsigned Reg = ARM::GPRRegClass.getRegister(i);
+    // The register must not be reserved and must not be the frame pointer.
+    if (!Subtarget->isGPRegisterReserved(i) && Reg != FramePointer)
+      return Reg;
+  }
+  report_fatal_error("ran out of registers: Too many registers reserved");
+  return ARM::NoRegister;
+}
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -76,10 +76,10 @@
       ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
       ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
       ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
-      ARM::FeatureExecuteOnly, ARM::FeatureReserveR6, ARM::FeatureReserveR7,
-      ARM::FeatureReserveR8, ARM::FeatureReserveR9, ARM::FeatureReserveR10,
-      ARM::FeatureReserveR11, ARM::FeatureNoMovt,
-      ARM::FeatureNoNegativeImmediates
+      ARM::FeatureExecuteOnly, ARM::FeatureReserveR4, ARM::FeatureReserveR5,
+      ARM::FeatureReserveR6, ARM::FeatureReserveR7, ARM::FeatureReserveR8,
+      ARM::FeatureReserveR9, ARM::FeatureReserveR10, ARM::FeatureReserveR11,
+      ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates
   };
 
   const ARMSubtarget *getST() const { return ST; }
Index: llvm/lib/Target/ARM/Thumb1FrameLowering.h
===================================================================
--- llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -36,6 +36,9 @@
 
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
 
+  unsigned getScratchRegister(const MachineFunction &MF,
+                              BitVector inUse = BitVector()) const;
+
   MachineBasicBlock::iterator
   eliminateCallFramePseudoInstr(MachineFunction &MF,
                                 MachineBasicBlock &MBB,
Index: llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -63,6 +63,15 @@
   return !MFI.hasVarSizedObjects();
 }
 
+unsigned Thumb1FrameLowering::getScratchRegister(const MachineFunction &MF,
+                                                 BitVector InUse) const {
+  unsigned Reg = ARMFrameLowering::getScratchRegister(MF, InUse);
+  if (isARMLowRegister(Reg))
+    return Reg;
+  report_fatal_error("ran out of registers: Too many registers reserved");
+  return ARM::NoRegister;
+}
+
 static void emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI,
@@ -373,16 +382,9 @@
     // For a large stack frame, we might need a scratch register to store
     // the size of the frame.  We know all callee-save registers are free
    // at this point in the prologue, so pick one.
-    unsigned ScratchRegister = ARM::NoRegister;
-    for (auto &I : CSI) {
-      unsigned Reg = I.getReg();
-      if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
-        ScratchRegister = Reg;
-        break;
-      }
-    }
+    unsigned ScratchReg = getScratchRegister(MF);
     emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
-                                 ScratchRegister, MachineInstr::FrameSetup);
+                                 ScratchReg, MachineInstr::FrameSetup);
     if (!HasFP) {
       CFAOffset -= NumBytes;
       unsigned CFIIndex = MF.addFrameInst(
@@ -403,31 +405,34 @@
   if (RegInfo->needsStackRealignment(MF)) {
     const unsigned NrBitsToZero = countTrailingZeros(MFI.getMaxAlignment());
-    // Emit the following sequence, using R4 as a temporary, since we cannot use
-    // SP as a source or destination register for the shifts:
+    // Emit the following sequence, using our scratch register as a temporary,
+    // since we cannot use SP as a source or destination register for the
+    // shifts:
     // mov r4, sp
     // lsrs r4, r4, #NrBitsToZero
     // lsls r4, r4, #NrBitsToZero
     // mov sp, r4
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
-        .addReg(ARM::SP, RegState::Kill)
-        .add(predOps(ARMCC::AL));
+    // (e.g. ScratchReg = r4)
+    unsigned ScratchReg = getScratchRegister(MF);
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ScratchReg)
+        .addReg(ARM::SP, RegState::Kill)
+        .add(predOps(ARMCC::AL));
 
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tLSRri), ARM::R4)
-        .addDef(ARM::CPSR)
-        .addReg(ARM::R4, RegState::Kill)
-        .addImm(NrBitsToZero)
-        .add(predOps(ARMCC::AL));
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tLSRri), ScratchReg)
+        .addDef(ARM::CPSR)
+        .addReg(ScratchReg, RegState::Kill)
+        .addImm(NrBitsToZero)
+        .add(predOps(ARMCC::AL));
 
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tLSLri), ARM::R4)
-        .addDef(ARM::CPSR)
-        .addReg(ARM::R4, RegState::Kill)
-        .addImm(NrBitsToZero)
-        .add(predOps(ARMCC::AL));
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tLSLri), ScratchReg)
+        .addDef(ARM::CPSR)
+        .addReg(ScratchReg, RegState::Kill)
+        .addImm(NrBitsToZero)
+        .add(predOps(ARMCC::AL));
 
     BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-        .addReg(ARM::R4, RegState::Kill)
-        .add(predOps(ARMCC::AL));
+        .addReg(ScratchReg, RegState::Kill)
+        .add(predOps(ARMCC::AL));
 
     AFI->setShouldRestoreSPFromFP(true);
   }
@@ -511,13 +516,14 @@
     // Reset SP based on frame pointer only if the stack frame extends beyond
     // frame pointer stack slot, the target is ELF and the function has FP, or
     // the target uses var sized objects.
+    unsigned ScratchReg = Thumb1FrameLowering::getScratchRegister(MF);
     if (NumBytes) {
-      assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
+      assert(!MFI.getPristineRegs(MF).test(ScratchReg) &&
              "No scratch register to restore SP from FP!");
-      emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
-                                TII, *RegInfo);
+      emitThumbRegPlusImmediate(MBB, MBBI, dl, ScratchReg, FramePtr,
+                                -NumBytes, TII, *RegInfo);
       BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-          .addReg(ARM::R4)
+          .addReg(ScratchReg)
          .add(predOps(ARMCC::AL));
     } else
       BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
@@ -527,24 +533,16 @@
     // For a large stack frame, we might need a scratch register to store
     // the size of the frame. We know all callee-save registers are free
    // at this point in the epilogue, so pick one.
-    unsigned ScratchRegister = ARM::NoRegister;
-    bool HasFP = hasFP(MF);
-    for (auto &I : MFI.getCalleeSavedInfo()) {
-      unsigned Reg = I.getReg();
-      if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
-        ScratchRegister = Reg;
-        break;
-      }
-    }
+    unsigned ScratchReg = getScratchRegister(MF);
 
     if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
         &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
       MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
       if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
         emitPrologueEpilogueSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes,
-                                     ScratchRegister, MachineInstr::NoFlags);
+                                     ScratchReg, MachineInstr::NoFlags);
     } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
       emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes,
-                                   ScratchRegister, MachineInstr::NoFlags);
+                                   ScratchReg, MachineInstr::NoFlags);
   }
 }
Index: llvm/test/CodeGen/ARM/Windows/chkstk-fixed-r4.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/Windows/chkstk-fixed-r4.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=thumbv7-windows -mcpu=cortex-a9 -filetype asm -mattr=+reserve-r4 %s -o - \
+; RUN:     | FileCheck %s
+
+define arm_aapcs_vfpcc void @check_r4usage() {
+entry:
+  %buffer = alloca [4096 x i8], align 1
+  ret void
+}
+
+; CHECK-NOT: push {{{.*}}r4{{.*}}}
+; CHECK: push {{{.*}}r5{{.*}}}
+; CHECK: mov r5, r4
+; CHECK: movw r4, #1024
+; CHECK: bl __chkstk
+; CHECK: sub.w sp, sp, r4
+; CHECK: mov r4, r5
+; CHECK: pop {{{.*}}r5{{.*}}}
+; CHECK-NOT: pop {{{.*}}r4{{.*}}}
+
Index: llvm/test/CodeGen/ARM/Windows/vla-fixed-r4.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/Windows/vla-fixed-r4.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -mattr=+reserve-r4 -filetype asm -o - %s \
+; RUN:     | FileCheck %s -check-prefix CHECK-SMALL-CODE
+; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -code-model=large -mattr=+reserve-r4 -filetype asm -o - %s \
+; RUN:     | FileCheck %s -check-prefix CHECK-LARGE-CODE
+; RUN: llc -mtriple=thumbv7-windows-msvc -mcpu=cortex-a9 -mattr=+reserve-r4 -filetype asm -o - %s \
+; RUN:     | FileCheck %s -check-prefix CHECK-SMALL-CODE
+
+define arm_aapcs_vfpcc i8 @function(i32 %sz, i32 %idx) {
+entry:
+  %vla = alloca i8, i32 %sz, align 1
+  %arrayidx = getelementptr inbounds i8, i8* %vla, i32 %idx
+  %0 = load volatile i8, i8* %arrayidx, align 1
+  ret i8 %0
+}
+
+; CHECK-SMALL-CODE: adds [[R4:r[0-9]+]], #7
+; CHECK-SMALL-CODE: bic [[R4]], [[R4]], #4
+; CHECK-SMALL-CODE: lsrs r4, [[R4]], #2
+; CHECK-SMALL-CODE: bl __chkstk
+; CHECK-SMALL-CODE: sub.w sp, sp, r4
+
+; CHECK-LARGE-CODE: adds [[R4:r[0-9]+]], #7
+; CHECK-LARGE-CODE: bic [[R4]], [[R4]], #4
+; CHECK-LARGE-CODE: lsrs r4, [[R4]], #2
+; CHECK-LARGE-CODE: movw [[IP:r[0-9]+]], :lower16:__chkstk
+; CHECK-LARGE-CODE: movt [[IP]], :upper16:__chkstk
+; CHECK-LARGE-CODE: blx [[IP]]
+; CHECK-LARGE-CODE: sub.w sp, sp, r4
Index: llvm/test/CodeGen/ARM/named-reg-alloc.ll
===================================================================
--- llvm/test/CodeGen/ARM/named-reg-alloc.ll
+++ llvm/test/CodeGen/ARM/named-reg-alloc.ll
@@ -4,11 +4,11 @@
 define i32 @get_stack() nounwind {
 entry:
 ; FIXME: Include an allocatable-specific error message
-; CHECK: Invalid register name "r5".
+; CHECK: Invalid register name "r3".
   %sp = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %sp
 }
 
 declare i32 @llvm.read_register.i32(metadata) nounwind
 
-!0 = !{!"r5\00"}
+!0 = !{!"r3\00"}
Index: llvm/test/CodeGen/ARM/reg-alloc-no-alignment.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/reg-alloc-no-alignment.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -verify-machineinstrs -mattr=+reserve-r10,+reserve-r9,+reserve-r8,+reserve-r4 \
+; RUN:     -asm-verbose=false | FileCheck %s
+
+
+declare dso_local i32 @bar(i32*)
+
+; Reserved registers should not be used to correct alignment.
+define hidden i32 @main() {
+; CHECK-NOT: r10
+; CHECK-NOT: r9
+; CHECK-NOT: r8
+; CHECK-NOT: r4
+; CHECK: {r5, r7, lr}
+  %a = alloca i32, i32 4, align 8
+  %1 = call i32 @bar (i32* %a)
+  ret i32 %1
+}
\ No newline at end of file
Index: llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r4-r5-r6.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r4-r5-r6.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mattr=+reserve-r4,+reserve-r5,+reserve-r6 -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s
+;
+; Equivalent C source code
+; void bar(unsigned int i,
+;          unsigned int j,
+;          unsigned int k,
+;          unsigned int l,
+;          unsigned int m,
+;          unsigned int n,
+;          unsigned int o,
+;          unsigned int p)
+; {
+;   unsigned int result = i + j + k + l +m + n + o + p;
+; }
+
+define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind {
+entry:
+; CHECK-NOT: push {{{.*}}r4{{.*}}}
+; CHECK-NOT: push {{{.*}}r5, r6{{.*}}}
+  %i.addr = alloca i32, align 4
+  %j.addr = alloca i32, align 4
+  %k.addr = alloca i32, align 4
+  %l.addr = alloca i32, align 4
+  %m.addr = alloca i32, align 4
+  %n.addr = alloca i32, align 4
+  %o.addr = alloca i32, align 4
+  %p.addr = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  store i32 %j, i32* %j.addr, align 4
+  store i32 %k, i32* %k.addr, align 4
+  store i32 %l, i32* %l.addr, align 4
+  store i32 %m, i32* %m.addr, align 4
+  store i32 %n, i32* %n.addr, align 4
+  store i32 %o, i32* %o.addr, align 4
+  store i32 %p, i32* %p.addr, align 4
+  %0 = load i32, i32* %i.addr, align 4
+  %1 = load i32, i32* %j.addr, align 4
+  %add = add i32 %0, %1
+  %2 = load i32, i32* %k.addr, align 4
+  %add1 = add i32 %add, %2
+  %3 = load i32, i32* %l.addr, align 4
+  %add2 = add i32 %add1, %3
+  %4 = load i32, i32* %m.addr, align 4
+  %add3 = add i32 %add2, %4
+  %5 = load i32, i32* %n.addr, align 4
+  %add4 = add i32 %add3, %5
+  %6 = load i32, i32* %o.addr, align 4
+  %add5 = add i32 %add4, %6
+  %7 = load i32, i32* %p.addr, align 4
+  %add6 = add i32 %add5, %7
+  store i32 %add6, i32* %result, align 4
+; CHECK-NOT: {{.*}}r4{{.*}}
+; CHECK-NOT: {{.*}}r5{{.*}}
+; CHECK-NOT: {{.*}}r6{{.*}}
+; CHECK: {{.*}}r7{{.*}}
+  ret void
+; CHECK-NOT: pop {{{.*}}r4{{.*}}}
+; CHECK-NOT: pop {{{.*}}r5, r6{{.*}}}
+}
Index: llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r5-r6.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r5-r6.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mattr=+reserve-r5,+reserve-r6 -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s
+;
+; Equivalent C source code
+; void bar(unsigned int i,
+;          unsigned int j,
+;          unsigned int k,
+;          unsigned int l,
+;          unsigned int m,
+;          unsigned int n,
+;          unsigned int o,
+;          unsigned int p)
+; {
+;   unsigned int result = i + j + k + l +m + n + o + p;
+; }
+
+define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind {
+entry:
+; CHECK-NOT: push {{{.*}}r5, r6{{.*}}}
+  %i.addr = alloca i32, align 4
+  %j.addr = alloca i32, align 4
+  %k.addr = alloca i32, align 4
+  %l.addr = alloca i32, align 4
+  %m.addr = alloca i32, align 4
+  %n.addr = alloca i32, align 4
+  %o.addr = alloca i32, align 4
+  %p.addr = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  store i32 %j, i32* %j.addr, align 4
+  store i32 %k, i32* %k.addr, align 4
+  store i32 %l, i32* %l.addr, align 4
+  store i32 %m, i32* %m.addr, align 4
+  store i32 %n, i32* %n.addr, align 4
+  store i32 %o, i32* %o.addr, align 4
+  store i32 %p, i32* %p.addr, align 4
+  %0 = load i32, i32* %i.addr, align 4
+  %1 = load i32, i32* %j.addr, align 4
+  %add = add i32 %0, %1
+  %2 = load i32, i32* %k.addr, align 4
+  %add1 = add i32 %add, %2
+  %3 = load i32, i32* %l.addr, align 4
+  %add2 = add i32 %add1, %3
+  %4 = load i32, i32* %m.addr, align 4
+  %add3 = add i32 %add2, %4
+  %5 = load i32, i32* %n.addr, align 4
+  %add4 = add i32 %add3, %5
+  %6 = load i32, i32* %o.addr, align 4
+  %add5 = add i32 %add4, %6
+  %7 = load i32, i32* %p.addr, align 4
+  %add6 = add i32 %add5, %7
+  store i32 %add6, i32* %result, align 4
+; CHECK: {{.*}}r4{{.*}}
+; CHECK-NOT: {{.*}}r5{{.*}}
+; CHECK-NOT: {{.*}}r6{{.*}}
+; CHECK: {{.*}}r7{{.*}}
+  ret void
+; CHECK-NOT: pop {{{.*}}r5, r6{{.*}}}
+}
Index: llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r5.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r5.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mattr=+reserve-r5 -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s
+;
+; Equivalent C source code
+; void bar(unsigned int i,
+;          unsigned int j,
+;          unsigned int k,
+;          unsigned int l,
+;          unsigned int m,
+;          unsigned int n,
+;          unsigned int o,
+;          unsigned int p)
+; {
+;   unsigned int result = i + j + k + l +m + n + o + p;
+; }
+
+define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind {
+entry:
+; CHECK-NOT: push {{{.*}}r5,{{.*}}}
+  %i.addr = alloca i32, align 4
+  %j.addr = alloca i32, align 4
+  %k.addr = alloca i32, align 4
+  %l.addr = alloca i32, align 4
+  %m.addr = alloca i32, align 4
+  %n.addr = alloca i32, align 4
+  %o.addr = alloca i32, align 4
+  %p.addr = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  store i32 %j, i32* %j.addr, align 4
+  store i32 %k, i32* %k.addr, align 4
+  store i32 %l, i32* %l.addr, align 4
+  store i32 %m, i32* %m.addr, align 4
+  store i32 %n, i32* %n.addr, align 4
+  store i32 %o, i32* %o.addr, align 4
+  store i32 %p, i32* %p.addr, align 4
+  %0 = load i32, i32* %i.addr, align 4
+  %1 = load i32, i32* %j.addr, align 4
+  %add = add i32 %0, %1
+  %2 = load i32, i32* %k.addr, align 4
+  %add1 = add i32 %add, %2
+  %3 = load i32, i32* %l.addr, align 4
+  %add2 = add i32 %add1, %3
+  %4 = load i32, i32* %m.addr, align 4
+  %add3 = add i32 %add2, %4
+  %5 = load i32, i32* %n.addr, align 4
+  %add4 = add i32 %add3, %5
+  %6 = load i32, i32* %o.addr, align 4
+  %add5 = add i32 %add4, %6
+  %7 = load i32, i32* %p.addr, align 4
+  %add6 = add i32 %add5, %7
+  store i32 %add6, i32* %result, align 4
+; CHECK: {{.*}}r4{{.*}}
+; CHECK-NOT: {{.*}}r5{{.*}}
+; CHECK: {{.*}}r6{{.*}}
+  ret void
+; CHECK-NOT: pop {{{.*}}r5,{{.*}}}
+}
Index: llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r4.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r4.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mattr=+reserve-r4 -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s
+;
+; Equivalent C source code
+; void bar(unsigned int i,
+;          unsigned int j,
+;          unsigned int k,
+;          unsigned int l,
+;          unsigned int m,
+;          unsigned int n,
+;          unsigned int o,
+;          unsigned int p)
+; {
+;   unsigned int result = i + j + k + l +m + n + o + p;
+; }
+
+define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind {
+entry:
+; CHECK-NOT: push {{{.*}}r4,{{.*}}}
+  %i.addr = alloca i32, align 4
+  %j.addr = alloca i32, align 4
+  %k.addr = alloca i32, align 4
+  %l.addr = alloca i32, align 4
+  %m.addr = alloca i32, align 4
+  %n.addr = alloca i32, align 4
+  %o.addr = alloca i32, align 4
+  %p.addr = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  store i32 %j, i32* %j.addr, align 4
+  store i32 %k, i32* %k.addr, align 4
+  store i32 %l, i32* %l.addr, align 4
+  store i32 %m, i32* %m.addr, align 4
+  store i32 %n, i32* %n.addr, align 4
+  store i32 %o, i32* %o.addr, align 4
+  store i32 %p, i32* %p.addr, align 4
+  %0 = load i32, i32* %i.addr, align 4
+  %1 = load i32, i32* %j.addr, align 4
+  %add = add i32 %0, %1
+  %2 = load i32, i32* %k.addr, align 4
+  %add1 = add i32 %add, %2
+  %3 = load i32, i32* %l.addr, align 4
+  %add2 = add i32 %add1, %3
+  %4 = load i32, i32* %m.addr, align 4
+  %add3 = add i32 %add2, %4
+  %5 = load i32, i32* %n.addr, align 4
+  %add4 = add i32 %add3, %5
+  %6 = load i32, i32* %o.addr, align 4
+  %add5 = add i32 %add4, %6
+  %7 = load i32, i32* %p.addr, align 4
+  %add6 = add i32 %add5, %7
+  store i32 %add6, i32* %result, align 4
+; CHECK: {{.*}}r5{{.*}}
+; CHECK-NOT: {{.*}}r4{{.*}}
+; CHECK: {{.*}}r6{{.*}}
+  ret void
+; CHECK-NOT: pop {{{.*}}r4,{{.*}}}
+}
Index: llvm/test/CodeGen/ARM/segmented-stacks-fixed-r4-r5.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/segmented-stacks-fixed-r4-r5.ll
@@ -0,0 +1,60 @@
+; RUN: not llc < %s -mtriple=arm-linux-androideabi -mattr=+v4t -mattr=+reserve-r4 2>&1 | FileCheck %s -check-prefix=ARM-android-ERROR-r4
+; RUN: not llc < %s -mtriple=arm-linux-androideabi -mattr=+v4t -mattr=+reserve-r5 2>&1 | FileCheck %s -check-prefix=ARM-android-ERROR-r5
+; RUN: not llc < %s -mtriple=arm-linux-unknown-gnueabi -mattr=+v4t -mattr=+reserve-r4 2>&1 | FileCheck %s -check-prefix=ARM-linux-ERROR
+
+; ARM-android-ERROR-r4: -ffixed-r4 is not allowed for this target when segmented stacks are in use.
+; ARM-android-ERROR-r5: -ffixed-r5 is not allowed for this target when segmented stacks are in use.
+; ARM-linux-ERROR: -ffixed-r4 is not allowed for this target when segmented stacks are in use.
+
+; RUN: llc < %s -mtriple=armv7--linux-gnueabihf -mattr=+reserve-r4 -verify-machineinstrs | FileCheck %s -check-prefix=r4-ARM-linux
+; RUN: llc < %s -mtriple=armv7--linux-gnueabihf -mattr=+reserve-r4 -mattr=+reserve-r5 -verify-machineinstrs | FileCheck %s -check-prefix=r4-r5-ARM-linux
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define void @test_basic() #0 {
+  %mem = alloca i32, i32 10
+  call void @dummy_use (i32* %mem, i32 10)
+  ret void
+
+; r4-ARM-linux: push {r5, r6}
+; r4-ARM-linux-NEXT: mov r6, r4
+; r4-ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; r4-ARM-linux-NEXT: mov r5, sp
+; r4-ARM-linux-NEXT: ldr r4, [r4, #4]
+; r4-ARM-linux-NEXT: cmp r4, r5
+; r4-ARM-linux-NEXT: blo .LBB0_2
+
+; r4-ARM-linux: mov r4, #48
+; r4-ARM-linux-NEXT: mov r5, #0
+; r4-ARM-linux-NEXT: stmdb sp!, {lr}
+; r4-ARM-linux-NEXT: bl __morestack
+; r4-ARM-linux-NEXT: ldm sp!, {lr}
+; r4-ARM-linux-NEXT: mov r4, r6
+; r4-ARM-linux-NEXT: pop {r5, r6}
+; r4-ARM-linux-NEXT: bx lr
+
+
+
+; r4-r5-ARM-linux: push {r6, r7}
+; r4-r5-ARM-linux-NEXT: mov r6, r4
+; r4-r5-ARM-linux-NEXT: mrc p15, #0, r4, c13, c0, #3
+; r4-r5-ARM-linux-NEXT: mov r7, r5
+; r4-r5-ARM-linux-NEXT: mov r5, sp
+; r4-r5-ARM-linux-NEXT: ldr r4, [r4, #4]
+; r4-r5-ARM-linux-NEXT: cmp r4, r5
+; r4-r5-ARM-linux-NEXT: blo .LBB0_2
+
+; r4-r5-ARM-linux: mov r4, #48
+; r4-r5-ARM-linux-NEXT: mov r5, #0
+; r4-r5-ARM-linux-NEXT: stmdb sp!, {lr}
+; r4-r5-ARM-linux-NEXT: bl __morestack
+; r4-r5-ARM-linux-NEXT: ldm sp!, {lr}
+; r4-r5-ARM-linux-NEXT: mov r4, r6
+; r4-r5-ARM-linux-NEXT: mov r5, r7
+; r4-r5-ARM-linux-NEXT: pop {r6, r7}
+; r4-r5-ARM-linux-NEXT: bx lr
+
+}
+
+attributes #0 = { "split-stack" }
Index: llvm/test/CodeGen/Thumb/callee_save_reserved.ll
===================================================================
--- llvm/test/CodeGen/Thumb/callee_save_reserved.ll
+++ llvm/test/CodeGen/Thumb/callee_save_reserved.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs -frame-pointer=none -mattr=+reserve-r6,+reserve-r8 \
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs -frame-pointer=none -mattr=+reserve-r6,+reserve-r5 \
 ; RUN:     -asm-verbose=false | FileCheck --check-prefix=CHECK-INVALID %s
 
 ; Reserved low registers should not be used to correct reg deficit.
 define <4 x i32> @four_high_four_return_reserved() {
 entry:
-  ; CHECK-INVALID-NOT: r{{6|8}}
+  ; CHECK-INVALID-NOT: r{{[5-6]+}}
   tail call void asm sideeffect "", "~{r8},~{r9}"()
   %vecinit = insertelement <4 x i32> undef, i32 1, i32 0
   %vecinit11 = insertelement <4 x i32> %vecinit, i32 2, i32 1
Index: llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
===================================================================
--- llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
+++ llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT:    .setfp r7, sp, #12
 ; CHECK-NEXT:    add r7, sp, #12
 ; CHECK-NEXT:    .pad #4100
-; CHECK-NEXT:    ldr r6, .LCPI0_0
-; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    ldr r4, .LCPI0_0
+; CHECK-NEXT:    add sp, r4
 ; CHECK-NEXT:    mov r6, sp
 ; CHECK-NEXT:    adds r0, r0, #7
 ; CHECK-NEXT:    movs r1, #7
@@ -60,8 +60,8 @@
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    .pad #8196
-; CHECK-NEXT:    ldr r7, .LCPI1_0
-; CHECK-NEXT:    add sp, r7
+; CHECK-NEXT:    ldr r4, .LCPI1_0
+; CHECK-NEXT:    add sp, r4
 ; CHECK-NEXT:    add r0, sp, #4
 ; CHECK-NEXT:    ldr r1, .LCPI1_2
 ; CHECK-NEXT:    add r1, sp
@@ -74,8 +74,8 @@
 ; CHECK-NEXT:    ldr r0, [sp]
 ; CHECK-NEXT:    @APP
 ; CHECK-NEXT:    @NO_APP
-; CHECK-NEXT:    ldr r7, .LCPI1_1
-; CHECK-NEXT:    add sp, r7
+; CHECK-NEXT:    ldr r4, .LCPI1_1
+; CHECK-NEXT:    add sp, r4
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
@@ -120,8 +120,8 @@
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    .pad #8196
-; CHECK-NEXT:    ldr r6, .LCPI2_0
-; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    ldr r4, .LCPI2_0
+; CHECK-NEXT:    add sp, r4
 ; CHECK-NEXT:    add r0, sp, #4
 ; CHECK-NEXT:    ldr r1, .LCPI2_2
 ; CHECK-NEXT:    add r1, sp
@@ -134,8 +134,8 @@
 ; CHECK-NEXT:    ldr r7, [sp]
 ; CHECK-NEXT:    @APP
 ; CHECK-NEXT:    @NO_APP
-; CHECK-NEXT:    ldr r6, .LCPI2_1
-; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    ldr r4, .LCPI2_1
+; CHECK-NEXT:    add sp, r4
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.1:
Index: llvm/test/CodeGen/Thumb2/segmented-stacks-fixed-r4.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/segmented-stacks-fixed-r4.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -mcpu=arm1156t2-s -mattr=+thumb2 -mattr=+reserve-r4 -verify-machineinstrs | FileCheck %s -check-prefix=Thumb-android
+; RUN: llc < %s -mtriple=thumb-linux-androideabi -mcpu=arm1156t2-s -mattr=+thumb2 -mattr=+reserve-r4 -filetype=obj
+
+
+; Just to prevent the alloca from being optimized away
+declare void @dummy_use(i32*, i32)
+
+define void @test_basic() #0 {
+  %mem = alloca i32, i32 10
+  call void @dummy_use (i32* %mem, i32 10)
+  ret void
+
+; Thumb-android: test_basic:
+
+; Thumb-android: push {r5, r6}
+; Thumb-android-NEXT: mov r6, r4
+; Thumb-android-NEXT: mrc p15, #0, r4, c13, c0, #3
+; Thumb-android-NEXT: mov r5, sp
+; Thumb-android-NEXT: ldr r4, [r4, #252]
+; Thumb-android-NEXT: cmp r4, r5
+; Thumb-android-NEXT: blo .LBB0_2
+
+; Thumb-android: mov r4, #48
+; Thumb-android-NEXT: mov r5, #0
+; Thumb-android-NEXT: push {lr}
+; Thumb-android-NEXT: bl __morestack
+; Thumb-android-NEXT: ldr lr, [sp], #4
+; Thumb-android-NEXT: mov r4, r6
+; Thumb-android-NEXT: pop {r5, r6}
+; Thumb-android-NEXT: bx lr
+
+; Thumb-android: pop {r5, r6}
+
+}
+
+attributes #0 = { "split-stack" }
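
---

Note for reviewers: the end-to-end use case motivating this patch is pinning a global
named register to r4/r5 (the code comments above refer to this as "r4 might be used as
a global named register"). The following minimal sketch is illustrative only -- the
file name, variable name, and exact clang invocation are assumptions, not part of the
patch. A GNU C global register variable is only sound if compiler-generated code never
allocates or clobbers that register, which is exactly the guarantee that
-ffixed-r4 (mapped by the driver change above to the +reserve-r4 subtarget feature)
provides:

    /* counter.c - build with something like:
     *   clang --target=armv7a-linux-gnueabi -ffixed-r4 -O2 -c counter.c
     * The variable lives in r4 for the whole program; reserving r4 keeps
     * regular codegen, including the prologue/epilogue scratch-register
     * logic patched above, from touching it. */
    register unsigned long event_count asm("r4");

    unsigned long bump(unsigned long n) {
      event_count += n;  /* compiles to arithmetic directly on r4 */
      return event_count;
    }

From LLVM IR, the reserved register can likewise be read through the named-register
intrinsic exercised by the tests above, e.g.
call i32 @llvm.read_register.i32(metadata !{!"r4\00"}).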