diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -91,6 +91,8 @@
   /// all registers that were disabled are removed from the list.
   SmallVector<MCPhysReg, 16> UpdatedCSRs;
 
+  void initUpdatedCSRs();
+
   /// RegAllocHints - This vector records register allocation hints for
   /// virtual registers. For each virtual register, it keeps a pair of hint
   /// type and hints vector making up the allocation hints. Only the first
@@ -231,12 +233,17 @@
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
-  /// \see UpdatedCalleeSavedRegs.
-  void disableCalleeSavedRegister(unsigned Reg);
+  /// \see UpdatedCSRs.
+  void disableCalleeSavedRegister(Register Reg);
+
+  /// Enables the register in the list of CSRs.
+  /// I.e. the register will appear as part of the CSR mask.
+  /// \see UpdatedCSRs.
+  void enableCalleeSavedRegister(Register Reg);
 
   /// Returns list of callee saved registers.
   /// The function returns the updated CSR list (after taking into account
-  /// registers that are disabled from the CSR list).
+  /// registers that are enabled/disabled from the CSR list).
   const MCPhysReg *getCalleeSavedRegs() const;
 
   /// Sets the updated Callee Saved Registers list.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -610,30 +610,54 @@
   return false;
 }
 
-void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
+void MachineRegisterInfo::initUpdatedCSRs() {
+  if (IsUpdatedCSRsInitialized)
+    return;
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
-  assert(Reg && (Reg < TRI->getNumRegs()) &&
-         "Trying to disable an invalid register");
+  const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+  for (const MCPhysReg *I = CSR; *I; ++I)
+    UpdatedCSRs.push_back(*I);
 
-  if (!IsUpdatedCSRsInitialized) {
-    const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
-    for (const MCPhysReg *I = CSR; *I; ++I)
-      UpdatedCSRs.push_back(*I);
+  // Zero value represents the end of the register list
+  // (no more registers should be pushed).
+  UpdatedCSRs.push_back(0);
 
-    // Zero value represents the end of the register list
-    // (no more registers should be pushed).
-    UpdatedCSRs.push_back(0);
+  IsUpdatedCSRsInitialized = true;
+}
 
-    IsUpdatedCSRsInitialized = true;
-  }
+void MachineRegisterInfo::disableCalleeSavedRegister(Register Reg) {
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  assert(Reg && (Reg < TRI->getNumRegs()) &&
+         "Trying to disable an invalid register");
+
+  initUpdatedCSRs();
 
-  // Remove the register (and its aliases from the list).
+  // Remove the register (and its aliases) from the CSR list.
  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
    UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI),
                      UpdatedCSRs.end());
 }
 
+void MachineRegisterInfo::enableCalleeSavedRegister(Register Reg) {
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  assert(Reg && (Reg < TRI->getNumRegs()) &&
+         "Trying to enable an invalid register");
+
+  initUpdatedCSRs();
+
+  // Remove the null terminator from the end of the list.
+  assert(UpdatedCSRs.back() == 0);
+  UpdatedCSRs.pop_back();
+
+  // Add the register (and its sub-registers) to the CSR list.
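+  // MCSubRegIterator is constructed with IncludeSelf=true, so this visits Reg
+  // itself first, followed by each of its sub-registers.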
+  for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI)
+    UpdatedCSRs.push_back(*SRI);
+
+  // Put the null terminator back.
+  UpdatedCSRs.push_back(0);
+}
+
 const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
   if (IsUpdatedCSRsInitialized)
     return UpdatedCSRs.data();
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -453,6 +453,8 @@
       FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset);
     }
 
+    LLVM_DEBUG(dbgs() << "Assigned " << RegInfo->getName(Reg)
+                      << " to spill slot " << FrameIdx << "\n");
     CS.setFrameIdx(FrameIdx);
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h
--- a/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -56,6 +56,10 @@
   void getCalleeSaves(const MachineFunction &MF,
                       BitVector &SavedRegs) const override;
+  void findRegDefsOutsideSaveRestore(MachineFunction &MF,
+                                     BitVector &Regs) const;
+  unsigned spillExtraRegsForIPRA(MachineFunction &MF, BitVector &SavedRegs,
+                                 bool HasFPRegSaves) const;
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS) const override;
@@ -63,9 +67,8 @@
                                   MachineBasicBlock &MBB) const override;
 
   /// Returns true if the target will correctly handle shrink wrapping.
-  bool enableShrinkWrapping(const MachineFunction &MF) const override {
-    return true;
-  }
+  bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
   bool isProfitableForNoCSROpt(const Function &F) const override {
     // The no-CSR optimisation is bad for code size on ARM, because we can save
     // many registers with a single PUSH/POP pair.
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -71,6 +71,14 @@
     SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
                          cl::desc("Align ARM NEON spills in prolog and epilog"));
 
+static cl::opt<bool> EnableExtraSpills(
+    "arm-extra-spills", cl::Hidden, cl::init(false),
+    cl::desc("Preserve extra registers when useful for IPRA"));
+
+// Testing option to bypass some profitability checks.
+static cl::opt<bool> ForceExtraSpills("arm-extra-spills-force", cl::Hidden,
+                                      cl::init(false));
+
 static MachineBasicBlock::iterator
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                         unsigned NumAlignedDPRCS2Regs);
@@ -1617,6 +1625,251 @@
     SavedRegs.set(ARM::R4);
 }
 
+// Compute the set of registers which cannot be preserved, because they are
+// either modified outside the PUSH/POP instructions, or are live at the point
+// where the POP will be inserted. This only considers r0-r3, which are
+// currently the only registers we voluntarily save when the PCS doesn't
+// require it.
+void ARMFrameLowering::findRegDefsOutsideSaveRestore(
+    MachineFunction &MF, BitVector &UnsaveableRegs) const {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  SmallSet<MachineBasicBlock *, 4> SaveBlocks;
+  SmallSet<MachineBasicBlock *, 4> RestoreBlocks;
+
+  if (MFI.getSavePoint()) {
+    SaveBlocks.insert(MFI.getSavePoint());
+    RestoreBlocks.insert(MFI.getRestorePoint());
+  } else {
+    SaveBlocks.insert(&MF.front());
+    for (MachineBasicBlock &MBB : MF)
+      if (MBB.isReturnBlock())
+        RestoreBlocks.insert(&MBB);
+  }
+
+  // Walk blocks from the function entry and exits (following control flow both
+  // ways), stopping when we get to a save/restore block. Check for
+  // instructions which modify any of the registers we care about.
+  SmallVector<MachineBasicBlock *, 8> WorkList;
+  SmallSet<MachineBasicBlock *, 8> VisitedBlocks;
+  LLVM_DEBUG(dbgs() << "Entry block: " << MF.front().getName() << "\n");
+  WorkList.push_back(&MF.front());
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.isReturnBlock()) {
+      LLVM_DEBUG(dbgs() << "Return block: " << MBB.getName() << "\n");
+      WorkList.push_back(&MBB);
+    }
+  }
+
+  auto CheckOutsideInst = [&UnsaveableRegs, TRI](MachineInstr &MI) {
+    for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+      if (MI.modifiesRegister(Reg, TRI)) {
+        UnsaveableRegs.set(Reg);
+        LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
+                          << " modified by instruction " << MI << "\n");
+      }
+    }
+  };
+
+  while (!WorkList.empty()) {
+    MachineBasicBlock *MBB = WorkList.pop_back_val();
+
+    if (VisitedBlocks.count(MBB))
+      continue;
+    VisitedBlocks.insert(MBB);
+
+    bool IsSave = SaveBlocks.count(MBB);
+    bool IsRestore = RestoreBlocks.count(MBB);
+
+    LLVM_DEBUG(dbgs() << "Visiting block " << MBB->getName() << ", IsSave="
+                      << IsSave << ", IsRestore=" << IsRestore << "\n");
+
+    // If this is a restore block, the POP instruction will be inserted just
+    // before the terminator, so we need to consider any terminator
+    // instructions to be outside the preserved region. We also need to check
+    // for registers which are live at the POP insertion point, because these
+    // can't be restored without changing their value.
+    if (IsRestore) {
+      LivePhysRegs LPR(*TRI);
+      LPR.addLiveOuts(*MBB);
+      for (auto &Term : reverse(MBB->terminators())) {
+        LPR.stepBackward(Term);
+        CheckOutsideInst(Term);
+      }
+
+      for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+        if (LPR.contains(Reg)) {
+          UnsaveableRegs.set(Reg);
+          LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg)
+                            << " live-out of restore block " << MBB->getName()
+                            << "\n");
+        }
+      }
+    }
+
+    // If this block is completely outside the save/restore region, then any
+    // modified registers can't be preserved. A save block counts as being
+    // inside the saved region, with the possible exception of the last few
+    // instructions if it's also a restore block, handled above. We don't visit
+    // blocks which are completely inside the saved region and don't have any
+    // save/restore instructions, so don't need to check that here.
+    if (!IsSave && !IsRestore)
+      for (auto &MI : *MBB)
+        CheckOutsideInst(MI);
+
+    // Walk the control flow graph in both directions, except for blocks which
+    // are inside the PUSH/POP region.
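+    // A save block only continues the walk through its predecessors (the code
+    // before the PUSH), a restore block only through its successors (the code
+    // after the POP); blocks entirely outside the region continue in both
+    // directions.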
+    if (IsSave || !IsRestore)
+      for (auto Pred : MBB->predecessors())
+        WorkList.push_back(Pred);
+    if (!IsSave || IsRestore)
+      for (auto Succ : MBB->successors())
+        WorkList.push_back(Succ);
+  }
+}
+
+bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+  // Shrink wrapping is detrimental to code size because it prevents merging
+  // the CSR restore and function return into one POP instruction. It also
+  // conflicts with saving extra registers for IPRA, because it makes more
+  // registers live at the PUSH/POP.
+  if (MF.getFunction().hasMinSize())
+    return false;
+
+  return true;
+}
+
+// When doing inter-procedural register allocation, saving extra registers in
+// [r0,r3] will allow us to keep live values in them in any callers. The extra
+// saves and restores don't cost us any code-size if we are already emitting
+// PUSH and POP instructions.
+unsigned ARMFrameLowering::spillExtraRegsForIPRA(MachineFunction &MF,
+                                                 BitVector &SavedRegs,
+                                                 bool HasFPRegSaves) const {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  LLVM_DEBUG(dbgs() << "Extra spills for " << MF.getName() << ": ");
+
+  if (!EnableExtraSpills) {
+    LLVM_DEBUG(dbgs() << "optimisation not enabled\n");
+    return 0;
+  }
+
+  // If IPRA is not enabled, nothing will be able to take advantage of the
+  // extra saved registers.
+  if (!MF.getTarget().Options.EnableIPRA) {
+    LLVM_DEBUG(dbgs() << "IPRA disabled\n");
+    return 0;
+  }
+
+  // These registers will take extra time to save and restore, and will often
+  // go unused, so only do this at -Oz.
+  if (!MF.getFunction().hasMinSize()) {
+    LLVM_DEBUG(dbgs() << "not minsize\n");
+    return 0;
+  }
+
+  // If we are not currently spilling any registers, we'd need to add an extra
+  // PUSH/POP pair, so this isn't worth it.
+  if (!SavedRegs.any()) {
+    LLVM_DEBUG(dbgs() << "no existing push/pop\n");
+    return 0;
+  }
+
+  // If we can't guarantee that this definition of the function is the one
+  // which will be picked by the linker, then IPRA can't make use of any extra
+  // saved registers.
+  if (!MF.getFunction().isDefinitionExact()) {
+    LLVM_DEBUG(dbgs() << "inexact definition\n");
+    return 0;
+  }
+
+  int NumVisibleCallers = 0;
+  for (const User *U : MF.getFunction().users()) {
+    if (const CallBase *Call = dyn_cast<CallBase>(U)) {
+      if (Call->getCalledOperand() == &MF.getFunction()) {
+        ++NumVisibleCallers;
+      }
+    }
+  }
+
+  // If we don't have any direct callers in the current translation unit,
+  // nothing will be able to take advantage of the extra saved registers.
+  if (NumVisibleCallers == 0 && !ForceExtraSpills) {
+    LLVM_DEBUG(dbgs() << "no visible callers\n");
+    return 0;
+  }
+
+  // If we need to emit unwind tables, these will be longer if we need to
+  // preserve r0-r3, so we need a lot of visible calls to make this worthwhile.
+  if (MF.getFunction().needsUnwindTableEntry() && NumVisibleCallers <= 8 &&
+      !ForceExtraSpills) {
+    LLVM_DEBUG(dbgs() << "needs unwind table\n");
+    return 0;
+  }
+
+  // Ok, we've decided we are going to try the optimisation.
+  LLVM_DEBUG(dbgs() << "enabled\n");
+
+  // Compute the registers which can't be preserved because they are either
+  // modified before the PUSH or after the POP, or are live at the point where
+  // the POP will be inserted.
+  BitVector NonPreserveableRegisters;
+  NonPreserveableRegisters.resize(TRI->getNumRegs());
+  findRegDefsOutsideSaveRestore(MF, NonPreserveableRegisters);
+
+  unsigned NumExtraRegs = 0;
+
+  // We'd also like to leave some registers free so that we can use them to
+  // fold a small SP update into the PUSH/POP. We can't know exactly what this
+  // optimisation can do, because stack layout isn't finalised, but we can make
+  // a good enough estimate.
+  unsigned StackSize = MFI.estimateStackSize(MF);
+
+  // If the stack space is large, we probably won't be able to fold the SP
+  // update into the push/pop, so we should save all the registers we want. If
+  // we have FP register saves, then the SP update will be folded into the
+  // VPUSH/VPOP instead, and we can use the GPRs freely.
+  if (StackSize > 16 || HasFPRegSaves)
+    StackSize = 0;
+
+  LLVM_DEBUG(dbgs() << "Estimated " << StackSize
+                    << " bytes of SP update being folded into push/pop\n");
+
+  for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+    if (StackSize) {
+      StackSize -= 4;
+      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
+                        << ", wanted for SP update\n");
+      continue;
+    }
+
+    // If we don't modify the register anywhere in this function, IPRA will
+    // already know that it is preserved, and there's no point in saving it.
+    if (!MRI.isPhysRegModified(Reg)) {
+      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
+                        << ", not modified\n");
+      continue;
+    }
+
+    if (NonPreserveableRegisters[Reg]) {
+      LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg)
+                        << ", modified outside save region\n");
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "also saving " << TRI->getName(Reg) << " for IPRA\n");
+    SavedRegs.set(Reg);
+    MRI.enableCalleeSavedRegister(Reg);
+    ++NumExtraRegs;
+  }
+
+  return NumExtraRegs;
+}
+
 void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                             BitVector &SavedRegs,
                                             RegScavenger *RS) const {
@@ -2007,6 +2260,14 @@
                       << "\n");
   }
 
+  // When using IPRA, we might want to preserve some of r0-r3, to reduce
+  // register pressure in our callers.
+  unsigned ExtraIPRASpills =
+      spillExtraRegsForIPRA(MF, SavedRegs, NumFPRSpills != 0);
+  NumGPRSpills += ExtraIPRASpills;
+  if (ExtraIPRASpills)
+    CS1Spilled = true;
+
   // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to
   // restore LR in that case.
   bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2458,26 +2458,25 @@
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
- if (!isTailCall) { - const uint32_t *Mask; - const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); - if (isThisReturn) { - // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(MF, CallConv); - if (!Mask) { - // Set isThisReturn to false if the calling convention is not one that - // allows 'returned' to be modeled in this way, so LowerCallResult does - // not try to pass 'this' straight through - isThisReturn = false; - Mask = ARI->getCallPreservedMask(MF, CallConv); - } - } else + const uint32_t *Mask; + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; Mask = ARI->getCallPreservedMask(MF, CallConv); - - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + } + } else { + Mask = ARI->getCallPreservedMask(MF, CallConv); } + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + if (InFlag.getNode()) Ops.push_back(InFlag); diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -216,6 +216,10 @@ break; } LLVM_FALLTHROUGH; + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: @@ -848,7 +852,8 @@ if (!LoRegsToSave.none()) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); - for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) { + for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5, + ARM::R6, ARM::R7, ARM::LR}) { if (LoRegsToSave[Reg]) { bool isKill = !MRI.isLiveIn(Reg); if (isKill && !MRI.isReserved(Reg)) @@ -956,6 +961,9 @@ llvm_unreachable("callee-saved register of unexpected class"); } + if (Reg == ARM::LR) + I.setRestored(false); + // If this is a low register not used as the frame pointer, we may want to // use it for restoring the high registers. 
     if ((ARM::tGPRRegClass.contains(Reg)) &&
@@ -980,6 +988,9 @@
   static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3,
                                          ARM::R4, ARM::R5, ARM::R6, ARM::R7};
   static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11};
+  static const unsigned AllLoRegs[] = {ARM::R0, ARM::R1, ARM::R2,
+                                       ARM::R3, ARM::R4, ARM::R5,
+                                       ARM::R6, ARM::R7, ARM::LR};
 
   const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
   const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
@@ -1018,16 +1029,10 @@
       BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
 
   bool NeedsPop = false;
-  for (unsigned i = CSI.size(); i != 0; --i) {
-    CalleeSavedInfo &Info = CSI[i-1];
-    unsigned Reg = Info.getReg();
-
-    // High registers (excluding lr) have already been dealt with
-    if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR))
+  for (unsigned Reg : AllLoRegs) {
+    if (!LoRegsToRestore[Reg])
       continue;
 
-    if (Reg == ARM::LR) {
-      Info.setRestored(false);
       if (!MBB.succ_empty() || MI->getOpcode() == ARM::TCRETURNdi ||
           MI->getOpcode() == ARM::TCRETURNri)
diff --git a/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ipra-extra-spills-exceptions.ll
@@ -0,0 +1,149 @@
+; RUN: llc -mtriple armv7a--none-eabi -enable-ipra=true -arm-extra-spills -arm-extra-spills-force -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -arm-extra-spills-force -verify-machineinstrs < %s | FileCheck %s
+
+; Test the interaction between IPRA and C++ exception handling. Currently, IPRA
+; only marks registers as preserved on the non-exceptional return path, not in
+; the landing pad.
+
+declare dso_local i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+declare dso_local void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
+declare dso_local i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+declare dso_local i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+declare dso_local void @__cxa_end_catch() local_unnamed_addr
+
+@g = dso_local local_unnamed_addr global i32 0, align 4
+@_ZTIi = external dso_local constant i8*
+
+define dso_local i32 @_Z11maybe_throwv() minsize {
+; This function might return normally, or might throw an exception. r0 is used
+; for a return value, so we can preserve r1-r3 for IPRA.
+; CHECK: .save {r1, r2, r3, lr}
+; CHECK-NEXT: push {r1, r2, r3, lr}
+; CHECK: pop{{(..)?}} {r1, r2, r3, pc}
+entry:
+  %0 = load i32, i32* @g, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %exception = tail call i8* @__cxa_allocate_exception(i32 4)
+  %1 = bitcast i8* %exception to i32*
+  store i32 42, i32* %1, align 8
+  tail call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+
+if.else:                                          ; preds = %entry
+  ret i32 1337
+}
+
+; Use inline assembly to force r0-r3 to be alive across a potentially throwing
+; call, using them on the non-exceptional return path. r0 is the return value,
+; so must be copied to another register. r1-r3 are voluntarily preserved by the
+; callee, so can be left in those registers.
+define dso_local i32 @_Z25test_non_exceptional_pathv() minsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK: @APP +; CHECK-NEXT: @ def r0-r3 +; CHECK-NEXT: @NO_APP +; CHECK-NEXT: mov [[SAVE_R0:r[0-9]+]], r0 +; CHECK-NEXT: .Ltmp{{.*}} +; CHECK-NEXT: bl _Z11maybe_throwv +; CHECK: mov r0, [[SAVE_R0]] +; CHECK-NEXT: @APP +; CHECK-NEXT: @ use r0-r3 +; CHECK-NEXT: @NO_APP +entry: + %0 = tail call { i32, i32, i32, i32 } asm sideeffect "// def r0-r3", "={r0},={r1},={r2},={r3}"() + %call = invoke i32 @_Z11maybe_throwv() + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + catch i8* bitcast (i8** @_ZTIi to i8*) + %2 = extractvalue { i8*, i32 } %1, 1 + %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) + %matches = icmp eq i32 %2, %3 + br i1 %matches, label %catch, label %ehcleanup + +catch: ; preds = %lpad + %4 = extractvalue { i8*, i32 } %1, 0 + %5 = tail call i8* @__cxa_begin_catch(i8* %4) + %6 = bitcast i8* %5 to i32* + %7 = load i32, i32* %6, align 4 + tail call void @__cxa_end_catch() + br label %cleanup + +try.cont: ; preds = %entry + %asmresult3 = extractvalue { i32, i32, i32, i32 } %0, 3 + %asmresult2 = extractvalue { i32, i32, i32, i32 } %0, 2 + %asmresult1 = extractvalue { i32, i32, i32, i32 } %0, 1 + %asmresult = extractvalue { i32, i32, i32, i32 } %0, 0 + tail call void asm sideeffect "// use r0-r3", "{r0},{r1},{r2},{r3}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3) + br label %cleanup + +cleanup: ; preds = %try.cont, %catch + %retval.0 = phi i32 [ 0, %try.cont ], [ %7, %catch ] + ret i32 %retval.0 + +ehcleanup: ; preds = %lpad + resume { i8*, i32 } %1 +} + + +; Use inline assembly to force r0-r3 to be alive across a potentially throwing +; call, using them after catching the exception. IPRA does not currently mark +; voluntarily preserved registers as live into the landing pad block, so all +; four registers must be copied elsewhere. 
+define dso_local i32 @_Z21test_exceptional_pathv() local_unnamed_addr minsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK: @APP +; CHECK-NEXT: @ def r0-r3 +; CHECK-NEXT: @NO_APP +; CHECK-DAG: mov [[SAVE_R0:r[0-9]+]], r0 +; CHECK-DAG: mov [[SAVE_R1:r[0-9]+]], r1 +; CHECK-DAG: mov [[SAVE_R2:r[0-9]+]], r2 +; CHECK-DAG: mov [[SAVE_R3:r[0-9]+]], r3 +; CHECK: bl _Z11maybe_throw + +; CHECK: bl __cxa_begin_catch +; CHECK: mov r0, [[SAVE_R0]] +; CHECK-NEXT: mov r1, [[SAVE_R1]] +; CHECK-NEXT: mov r2, [[SAVE_R2]] +; CHECK-NEXT: mov r3, [[SAVE_R3]] +; CHECK-NEXT: @APP +; CHECK-NEXT: @ use r0-r3 +; CHECK-NEXT: @NO_APP +entry: + %0 = tail call { i32, i32, i32, i32 } asm sideeffect "// def r0-r3", "={r0},={r1},={r2},={r3}"() + %asmresult = extractvalue { i32, i32, i32, i32 } %0, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32 } %0, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32 } %0, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32 } %0, 3 + %call = invoke i32 @_Z11maybe_throwv() + to label %cleanup unwind label %lpad + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } + cleanup + catch i8* bitcast (i8** @_ZTIi to i8*) + %2 = extractvalue { i8*, i32 } %1, 1 + %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) + %matches = icmp eq i32 %2, %3 + br i1 %matches, label %catch, label %ehcleanup + +catch: ; preds = %lpad + %4 = extractvalue { i8*, i32 } %1, 0 + %5 = tail call i8* @__cxa_begin_catch(i8* %4) + %6 = bitcast i8* %5 to i32* + %7 = load i32, i32* %6, align 4 + tail call void asm sideeffect "// use r0-r3", "{r0},{r1},{r2},{r3}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3) + tail call void @__cxa_end_catch() + br label %cleanup + +cleanup: ; preds = %entry, %catch + %retval.0 = phi i32 [ %7, %catch ], [ 0, %entry ] + ret i32 %retval.0 + +ehcleanup: ; preds = %lpad + resume { i8*, i32 } %1 +} diff --git a/llvm/test/CodeGen/ARM/ipra-extra-spills.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll @@ -0,0 +1,406 @@ +; RUN: llc -mtriple armv7a--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM +; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB2 +; RUN: llc -mtriple thumbv6m--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1 + +; This clobbers r0, and already needs a push/pop, so we also save and restore +; r0. The push of r11 is to maintain stack alignment (though that isn't +; technically needed in this example). +define void @test_r0_r4() minsize nounwind { +; CHECK-LABEL: test_r0_r4: +; ARM: .save {r0, r4, r11, lr} +; ARM: push {r0, r4, r11, lr} +; ARM: pop {r0, r4, r11, pc} +; THUMB1: .save {r0, r4, r7, lr} +; THUMB1: push {r0, r4, r7, lr} +; THUMB1: pop {r0, r4, r7, pc} +; THUMB2: .save {r0, r4, r7, lr} +; THUMB2: push {r0, r4, r7, lr} +; THUMB2: pop {r0, r4, r7, pc} + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This clobbers r0-r3, and already needs a push/pop, so we also save and +; restore all of them. 
+define void @test_r0_r1_r2_r3_r4() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4:
+; CHECK: .save {r0, r1, r2, r3, r4, lr}
+; CHECK: push {r0, r1, r2, r3, r4, lr}
+; CHECK: pop {r0, r1, r2, r3, r4, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret void
+}
+
+; Check that IPRA does make use of the extra saved registers.
+define void @test_ipra() nounwind {
+; CHECK-LABEL: test_ipra:
+; CHECK: ASM1: r0, r1, r2, r3
+; CHECK-NOT: r0
+; CHECK-NOT: r1
+; CHECK-NOT: r2
+; CHECK-NOT: r3
+; CHECK: bl test_r0_r1_r2_r3_r4
+; CHECK-NOT: r0
+; CHECK-NOT: r1
+; CHECK-NOT: r2
+; CHECK-NOT: r3
+; CHECK: ASM2: r0, r1, r2, r3
+  %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"()
+  %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0
+  %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1
+  %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2
+  %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3
+  call void @test_r0_r1_r2_r3_r4()
+  call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3)
+  ret void
+}
+
+; This clobbers r0-r3, but doesn't otherwise need a push/pop, so we don't add
+; them.
+define void @test_r0_r1_r2_r3() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3:
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+  ret void
+}
+
+; This function is never called in this translation unit, so we don't push any
+; extra registers.
+define void @test_r0_r4_not_called() minsize nounwind {
+; CHECK-LABEL: test_r0_r4_not_called:
+; CHECK: .save {r4, lr}
+; CHECK: push {r4, lr}
+; CHECK: pop {r4, pc}
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This function is only optsize, not minsize, so we don't add any extra saves.
+define void @test_r0_r4_not_minsize() optsize nounwind {
+; CHECK-LABEL: test_r0_r4_not_minsize:
+; CHECK: .save {r4, lr}
+; CHECK: push {r4, lr}
+; CHECK: pop {r4, pc}
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This function is not an exact definition (the linker could pick an
+; alternative version of it), so we don't add any extra saves.
+define linkonce_odr void @test_r0_r4_not_exact() minsize nounwind {
+; CHECK-LABEL: test_r0_r4_not_exact:
+; CHECK: .save {r4, lr}
+; CHECK: push {r4, lr}
+; CHECK: pop {r4, pc}
+; CHECK-NOT: push
+; CHECK-NOT: pop
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; This clobbers r0-r3, but returns a value in r0, so only r1-r3 are saved.
+define i32 @test_r0_r1_r2_r3_r4_return_1() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_1:
+; ARM: .save {r1, r2, r3, r4, r11, lr}
+; ARM: push {r1, r2, r3, r4, r11, lr}
+; ARM: pop {r1, r2, r3, r4, r11, pc}
+; THUMB1: .save {r1, r2, r3, r4, r7, lr}
+; THUMB1: push {r1, r2, r3, r4, r7, lr}
+; THUMB1: pop {r1, r2, r3, r4, r7, pc}
+; THUMB2: .save {r1, r2, r3, r4, r7, lr}
+; THUMB2: push {r1, r2, r3, r4, r7, lr}
+; THUMB2: pop {r1, r2, r3, r4, r7, pc}
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"()
+  ret i32 42
+}
+
+; This clobbers r0-r3, but returns a value in r0 and r1, so only r2-r3 are
+; saved.
+define i64 @test_r0_r1_r2_r3_r4_return_2() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_2: +; CHECK: .save {r2, r3, r4, lr} +; CHECK: push {r2, r3, r4, lr} +; CHECK: pop {r2, r3, r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret i64 42 +} + +; This clobbers r0-r3, but returns a value in all of r0-r3, so none of them can +; be saved. +define i128 @test_r0_r1_r2_r3_r4_return_4() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_4: +; CHECK: .save {r4, lr} +; CHECK: push {r4, lr} +; CHECK: pop {r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret i128 42 +} + +; This clobbers r0-r3, and returns a value in s0, so all of r0-r3 are saved (we +; previously only checked the number of return registers, ignoring their +; class). +define arm_aapcs_vfpcc float @test_r0_r1_r2_r3_r4_return_float() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_float: +; ARM: .save {r0, r1, r2, r3, r4, lr} +; ARM: push {r0, r1, r2, r3, r4, lr} +; ARM: pop {r0, r1, r2, r3, r4, pc} +; THUMB1: .save {r1, r2, r3, r4, r7, lr} +; THUMB1: push {r1, r2, r3, r4, r7, lr} +; THUMB1: pop {r1, r2, r3, r4, r7, pc} +; THUMB2: .save {r0, r1, r2, r3, r4, lr} +; THUMB2: push {r0, r1, r2, r3, r4, lr} +; THUMB2: pop {r0, r1, r2, r3, r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret float 42.0 +} + +; Saving of high registers in thumb1 is more complicated, because they need to +; be copied down to low registers to use push/pop instructions. Luckily, the +; extra registers we are preserving are low registers, which are handled by the +; outer-most push/pop pair, so this doesn't interact badly. +define void @test_save_high_regs() minsize nounwind { +; CHECK-LABEL: test_save_high_regs: +; ARM: .save {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; ARM: push {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; ARM: pop {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc} +; THUMB1: .save {r0, r1, r2, r3, r7, lr} +; THUMB1-NEXT: push {r0, r1, r2, r3, r7, lr} +; THUMB1-NEXT: mov lr, r11 +; THUMB1-NEXT: mov r7, r10 +; THUMB1-NEXT: mov r3, r9 +; THUMB1-NEXT: mov r2, r8 +; THUMB1-NEXT: .save {r8, r9, r10, r11} +; THUMB1-NEXT: push {r2, r3, r7, lr} +; THUMB1: pop {r0, r1, r2, r3} +; THUMB1-NEXT: mov r8, r0 +; THUMB1-NEXT: mov r9, r1 +; THUMB1-NEXT: mov r10, r2 +; THUMB1-NEXT: mov r11, r3 +; THUMB1-NEXT: pop {r0, r1, r2, r3, r7, pc} +; THUMB2: .save {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; THUMB2: push.w {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; THUMB2: pop.w {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r8},~{r9},~{r10},~{r11}"() + ret void +} + +; We can also use extra registers in the PUSH/POP instructions to move the SP +; to make space for local variables. These registers aren't preserved, because +; the space they are saved in is used for the local variable. We try to back +; off the extra-CSRs optimisation to allow this to still happen. In this case, +; there are 8 bytes of stack space needed, so we preserve two argument +; registers and use the other two for the SP update. 
+define void @test_r0_r1_r2_r3_r4_stack8() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8:
+; CHECK: .save {r2, r3, r4, lr}
+; CHECK: push {r0, r1, r2, r3, r4, lr}
+; CHECK: pop {r0, r1, r2, r3, r4, pc}
+  %a = alloca [2 x i32], align 4
+  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([2 x i32]* %a, i32 42)
+  ret void
+}
+
+; Check that, when the above function is called, r0 and r1 (used for the SP
+; updates) are considered clobbered, and r2 and r3 are preserved.
+define void @test_r0_r1_r2_r3_r4_stack8_caller() nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8_caller:
+; CHECK: ASM1: r0, r1, r2, r3
+; CHECK-NEXT: @NO_APP
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: bl test_r0_r1_r2_r3_r4_stack8
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: @APP
+; CHECK-NEXT: ASM2: r0, r1, r2, r3
+  %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"()
+  %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0
+  %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1
+  %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2
+  %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3
+  call void @test_r0_r1_r2_r3_r4_stack8()
+  call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3)
+  ret void
+}
+
+; Like @test_r0_r1_r2_r3_r4_stack8, but 16 bytes of stack space are needed, so
+; all of r0-r3 are used for the SP update, and not preserved.
+define void @test_r0_r1_r2_r3_r4_stack16() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack16:
+; CHECK: .save {r4, lr}
+; CHECK: push {r0, r1, r2, r3, r4, lr}
+; CHECK: pop {r0, r1, r2, r3, r4, pc}
+  %a = alloca [4 x i32], align 4
+  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([4 x i32]* %a, i32 42)
+  ret void
+}
+
+; If more than 16 bytes of stack space are needed, it's unlikely that the
+; SP-update folding optimisation will succeed, so we revert back to preserving
+; r0-r3 for use in our callers.
+define void @test_r0_r1_r2_r3_r4_stack24() minsize nounwind {
+; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack24:
+; CHECK: .save {r0, r1, r2, r3, r4, lr}
+; CHECK: push {r0, r1, r2, r3, r4, lr}
+; CHECK: pop {r0, r1, r2, r3, r4, pc}
+  %a = alloca [6 x i32], align 4
+  call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([6 x i32]* %a, i32 42)
+  ret void
+}
+
+define i32 @tail_callee(i32 %a, i32 %b) minsize nounwind {
+entry:
+  tail call void asm sideeffect "", "~{r2}"()
+  ret i32 %a
+}
+
+; The tail call happens outside the save/restore region, so prevents us from
+; preserving some registers. r0 and r1 are outgoing arguments to the tail-call,
+; so can't be preserved. r2 is modified inside the tail-called function, so
+; can't be preserved. r3 is known to be preserved by the callee, so can be
+; preserved. For Thumb1, we can't (efficiently) use a tail-call here, so r1-r3
+; are all preserved, with r0 being the return value.
+define i32 @test_tail_call() minsize nounwind { +entry: +; CHECK-LABEL: test_tail_call: +; ARM: .save {r3, lr} +; ARM: push {r3, lr} +; ARM: pop {r3, lr} +; ARM: b tail_callee +; THUMB2: .save {r3, lr} +; THUMB2: push {r3, lr} +; THUMB2: pop.w {r3, lr} +; THUMB2: b tail_callee +; THUMB1: .save {r1, r2, r3, lr} +; THUMB1: push {r1, r2, r3, lr} +; THUMB1: bl tail_callee +; THUMB1: pop {r1, r2, r3, pc} + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"() + %call = tail call i32 @tail_callee(i32 3, i32 4) + ret i32 %call +} + +declare i32 @tail_callee_external(i32 %a, i32 %b) + +; If we tail-call an external function, it could clobber any of r0-r3. +define i32 @test_tail_call_external() minsize nounwind { +entry: +; CHECK-LABEL: test_tail_call_external: +; ARM: .save {r11, lr} +; ARM: push {r11, lr} +; ARM: pop {r11, lr} +; ARM: b tail_callee_external +; THUMB2: .save {r7, lr} +; THUMB2: push {r7, lr} +; THUMB2: pop.w {r7, lr} +; THUMB2: b tail_callee_external +; THUMB1: .save {r1, r2, r3, lr} +; THUMB1: push {r1, r2, r3, lr} +; THUMB1: bl tail_callee_external +; THUMB1: pop {r1, r2, r3, pc} + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"() + %call = tail call i32 @tail_callee_external(i32 3, i32 4) + ret i32 %call +} + +define linkonce_odr i32 @tail_callee_linkonce_odr(i32 %a, i32 %b) minsize nounwind { +entry: + tail call void asm sideeffect "", "~{r2}"() + ret i32 %a +} + +; If a tail-callee has an interposable linkage type (such as linkonce_odr), we +; can't assume the linker will pick the definition we can see, so must assume +; it clobbers all of r0-r3. +define i32 @test_tail_call_linkonce_odr() minsize nounwind { +entry: +; CHECK-LABEL: test_tail_call_linkonce_odr: +; ARM: .save {r11, lr} +; ARM: push {r11, lr} +; ARM: pop {r11, lr} +; ARM: b tail_callee_linkonce_odr +; THUMB2: .save {r7, lr} +; THUMB2: push {r7, lr} +; THUMB2: pop.w {r7, lr} +; THUMB2: b tail_callee_linkonce_odr +; THUMB1: .save {r1, r2, r3, lr} +; THUMB1: push {r1, r2, r3, lr} +; THUMB1: bl tail_callee_linkonce_odr +; THUMB1: pop {r1, r2, r3, pc} + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"() + %call = tail call i32 @tail_callee_linkonce_odr(i32 3, i32 4) + ret i32 %call +} + +; This function doesn't have the nounwind attribute, so unwind tables will be +; emitted. Saving r0-r3 requires a longer unwind instruction sequence, which +; results in an increase in total code size if there are few callers to make +; use of the extra registers. +define void @test_unwind_tables() minsize { +; CHECK-LABEL: test_unwind_tables: +; ARM: .save {r4, lr} +; ARM: push {r4, lr} +; ARM: pop {r4, pc} +; THUMB1: .save {r4, lr} +; THUMB1: push {r4, lr} +; THUMB1: pop {r4, pc} +; THUMB2: .save {r4, lr} +; THUMB2: push {r4, lr} +; THUMB2: pop {r4, pc} + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This requires an unwind table, but has many call sites, so overall we expect +; the benefits to outweigh the size increase of the unwind table. 
+define void @test_unwind_tables_many_calls() minsize {
+; CHECK-LABEL: test_unwind_tables_many_calls:
+; ARM: .save {r0, r4, r11, lr}
+; ARM: push {r0, r4, r11, lr}
+; ARM: pop {r0, r4, r11, pc}
+; THUMB1: .save {r0, r4, r7, lr}
+; THUMB1: push {r0, r4, r7, lr}
+; THUMB1: pop {r0, r4, r7, pc}
+; THUMB2: .save {r0, r4, r7, lr}
+; THUMB2: push {r0, r4, r7, lr}
+; THUMB2: pop {r0, r4, r7, pc}
+  call void asm sideeffect "", "~{r0},~{r4}"()
+  ret void
+}
+
+; We don't do this optimisation if there are no callers in the same translation
+; unit (otherwise IPRA wouldn't be able to take advantage of the extra saved
+; registers), so most functions in this file are called here.
+define void @caller() {
+; CHECK-LABEL: caller:
+  call void @test_r0_r4()
+  call void @test_r0_r1_r2_r3_r4()
+  call void @test_r0_r1_r2_r3()
+  call void @test_r0_r4_not_minsize()
+  call void @test_r0_r4_not_exact()
+  %t1 = call i32 @test_r0_r1_r2_r3_r4_return_1()
+  %t2 = call i64 @test_r0_r1_r2_r3_r4_return_2()
+  %t3 = call i128 @test_r0_r1_r2_r3_r4_return_4()
+  %t4 = call arm_aapcs_vfpcc float @test_r0_r1_r2_r3_r4_return_float()
+  call void @test_save_high_regs()
+  call void @test_r0_r1_r2_r3_r4_stack16()
+  call void @test_r0_r1_r2_r3_r4_stack24()
+  %t5 = call i32 @test_tail_call()
+  %t6 = call i32 @test_tail_call_external()
+  %t7 = call i32 @test_tail_call_linkonce_odr()
+  call void @test_unwind_tables()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  call void @test_unwind_tables_many_calls()
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
--- a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
+++ b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll
@@ -66,16 +66,13 @@
 define void @f3(i32 %x) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    bne .LBB2_2
-; CHECK-NEXT:  @ %bb.1: @ %t
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    bl fn
-; CHECK-NEXT:    pop.w {r7, lr}
-; CHECK-NEXT:  .LBB2_2: @ %f
-; CHECK-NEXT:    bx lr
+; CHECK-NEXT:    cmp r0, #1
+; CHECK-NEXT:    itt eq
+; CHECK-NEXT:    moveq r0, #0
+; CHECK-NEXT:    bleq fn
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %p = icmp eq i32 %x, 1
   br i1 %p, label %t, label %f