Index: include/llvm/ADT/SparseSet.h
===================================================================
--- include/llvm/ADT/SparseSet.h
+++ include/llvm/ADT/SparseSet.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
+#include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <cstdlib>
@@ -128,7 +129,6 @@
   using KeyT = typename KeyFunctorT::argument_type;
   using DenseT = SmallVector<ValueT, 8>;
-  using size_type = unsigned;
   DenseT Dense;
   SparseT *Sparse = nullptr;
   unsigned Universe = 0;
@@ -141,6 +141,7 @@
   using const_reference = const ValueT &;
   using pointer = ValueT *;
   using const_pointer = const ValueT *;
+  using size_type = unsigned;
 
   SparseSet() = default;
   SparseSet(const SparseSet &) = delete;
@@ -154,17 +155,27 @@
   /// @param U Universe size. All object keys must be less than U.
   ///
   void setUniverse(unsigned U) {
-    // It's not hard to resize the universe on a non-empty set, but it doesn't
-    // seem like a likely use case, so we can add that code when we need it.
-    assert(empty() && "Can only resize universe on an empty map");
     // Hysteresis prevents needless reallocations.
     if (U >= Universe/4 && U <= Universe)
       return;
-    free(Sparse);
+    if (U > Universe)
+      U = std::max(U, 2 * Universe);
+
     // The Sparse array doesn't actually need to be initialized, so malloc
     // would be enough here, but that will cause tools like valgrind to
     // complain about branching on uninitialized data.
-    Sparse = static_cast<SparseT *>(safe_calloc(U, sizeof(SparseT)));
+    SparseT *S = static_cast<SparseT *>(safe_calloc(U, sizeof(SparseT)));
+
+    // Record already inserted elements in the new Sparse array.
+    for (unsigned i = 0, e = size(); i < e; i++) {
+      unsigned Idx = ValIndexOf(Dense[i]);
+      assert(Idx < U && "Index of an already inserted element is bigger than "
+                        "the new universe size");
+      S[Idx] = i;
+    }
+
+    free(Sparse);
+    Sparse = S;
     Universe = U;
   }
Index: include/llvm/CodeGen/LiveRangeEdit.h
===================================================================
--- include/llvm/CodeGen/LiveRangeEdit.h
+++ include/llvm/CodeGen/LiveRangeEdit.h
@@ -40,6 +40,7 @@
 class MachineLoopInfo;
 class MachineOperand;
 class TargetInstrInfo;
+class TargetRegisterClass;
 class TargetRegisterInfo;
 class VirtRegMap;
 
@@ -177,8 +178,10 @@
     return makeArrayRef(NewRegs).slice(FirstNew);
   }
 
-  /// createFrom - Create a new virtual register based on OldReg.
-  unsigned createFrom(unsigned OldReg);
+  /// createFrom - Create a new virtual register based on OldReg. If RC is
+  /// specified, the new register has that class; otherwise the class of
+  /// OldReg is used.
+  unsigned createFrom(unsigned OldReg, const TargetRegisterClass *RC = nullptr);
 
   /// create - Create a new register with the same class and original slot as
   /// parent.
Index: include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- include/llvm/CodeGen/TargetInstrInfo.h
+++ include/llvm/CodeGen/TargetInstrInfo.h
@@ -898,6 +898,29 @@
                      "TargetInstrInfo::loadRegFromStackSlot!");
   }
 
+  /// Return a register class that is appropriate for stack save/restore of
+  /// the given register class.
+  ///
+  /// For instance, Thumb1 does not provide instructions to directly
+  /// save/restore high registers. Storing a high register must be done by
+  /// first copying the value to a low register and then saving that register.
+  /// Similarly, a reload requires the reverse sequence. For this case, the
+  /// method returns the low-register class when given the high-register
+  /// class.
+  ///
+  /// This makes it possible to allocate a new register with the returned
+  /// class and insert a COPY instruction before/after the store/load created
+  /// by storeRegToStackSlot()/loadRegFromStackSlot():
+  ///   %1:save-restore-class = COPY %0:original-class
+  ///   STR %1:save-restore-class, %stack.1
+  ///
+  ///   %1:save-restore-class = LDR %stack.1
+  ///   %0:original-class = COPY %1:save-restore-class
+  virtual const TargetRegisterClass *
+  getRegClassForStackSaveRestore(const TargetRegisterClass *RC) const {
+    return RC;
+  }
+
   /// This function is called for all pseudo instructions
   /// that remain after register allocation. Many pseudo instructions are
   /// created to help register allocation. This is the place to convert them
Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -222,7 +222,7 @@
   bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>>,
                          MachineInstr *LoadMI = nullptr);
   void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI);
-  void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI);
+  void insertSpill(unsigned VReg, MachineBasicBlock::iterator MI);
   void spillAroundUses(unsigned Reg);
   void spillAll();
 
@@ -872,8 +872,21 @@
   MachineBasicBlock &MBB = *MI->getParent();
   MachineInstrSpan MIS(MI);
 
-  TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
-                           MRI.getRegClass(NewVReg), &TRI);
+  unsigned LoadReg = NewVReg;
+  const TargetRegisterClass &RC = *MRI.getRegClass(NewVReg);
+  const TargetRegisterClass &LoadRC = *TII.getRegClassForStackSaveRestore(&RC);
+  if (&RC != &LoadRC) {
+    LoadReg = Edit->createFrom(NewVReg, &LoadRC);
+    LLVM_DEBUG(dbgs() << "Using " << printReg(LoadReg, &TRI) << ":"
+                      << TRI.getRegClassName(&LoadRC)
+                      << " as an intermediate for the reload\n");
+  }
+
+  TII.loadRegFromStackSlot(MBB, MI, LoadReg, StackSlot, &LoadRC, &TRI);
+
+  if (&RC != &LoadRC)
+    BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(TargetOpcode::COPY), NewVReg)
+        .addReg(LoadReg, RegState::Kill);
 
   LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
 
@@ -897,31 +910,47 @@
 }
 
 /// insertSpill - Insert a spill of NewVReg after MI.
-void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
-                                MachineBasicBlock::iterator MI) {
+void InlineSpiller::insertSpill(unsigned NewVReg,
+                                MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-  MachineInstrSpan MIS(MI);
+  MachineBasicBlock::iterator InsertMI = std::next(MI);
 
   bool IsRealSpill = true;
   if (isFullUndefDef(*MI)) {
     // Don't spill undef value.
     // Anything works for undef, in particular keeping the memory
     // uninitialized is a viable option and it saves code size and
     // run time.
-    BuildMI(MBB, std::next(MI), MI->getDebugLoc(), TII.get(TargetOpcode::KILL))
-        .addReg(NewVReg, getKillRegState(isKill));
+    BuildMI(MBB, InsertMI, MI->getDebugLoc(), TII.get(TargetOpcode::KILL))
+        .addReg(NewVReg, RegState::Kill);
     IsRealSpill = false;
-  } else
-    TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot,
-                            MRI.getRegClass(NewVReg), &TRI);
+  } else {
+    unsigned StoreReg = NewVReg;
+    const TargetRegisterClass &RC = *MRI.getRegClass(NewVReg);
+    const TargetRegisterClass &StoreRC =
+        *TII.getRegClassForStackSaveRestore(&RC);
+    if (&RC != &StoreRC) {
+      StoreReg = Edit->createFrom(NewVReg, &StoreRC);
+      LLVM_DEBUG(dbgs() << "Using " << printReg(StoreReg, &TRI) << ":"
+                        << TRI.getRegClassName(&StoreRC)
+                        << " as an intermediate for the spill\n");
+
+      BuildMI(MBB, InsertMI, MI->getDebugLoc(), TII.get(TargetOpcode::COPY),
+              StoreReg)
+          .addReg(NewVReg, RegState::Kill);
+    }
+
+    TII.storeRegToStackSlot(MBB, InsertMI, StoreReg, RegState::Kill, StackSlot,
+                            &StoreRC, &TRI);
+  }
 
-  LIS.InsertMachineInstrRangeInMaps(std::next(MI), MIS.end());
+  LIS.InsertMachineInstrRangeInMaps(std::next(MI), InsertMI);
 
-  LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
+  LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), InsertMI, LIS,
                                                 "spill"));
 
   ++NumSpills;
   if (IsRealSpill)
-    HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
+    HSpiller.addToMergeableSpills(*std::prev(InsertMI), StackSlot, Original);
 }
 
 /// spillAroundUses - insert spill code around each use of Reg.
@@ -1021,7 +1050,7 @@
       // FIXME: Use a second vreg if instruction has no tied ops.
       if (RI.Writes)
         if (hasLiveDef)
-          insertSpill(NewVReg, true, MI);
+          insertSpill(NewVReg, MI);
     }
   }
Index: lib/CodeGen/LiveRangeEdit.cpp
===================================================================
--- lib/CodeGen/LiveRangeEdit.cpp
+++ lib/CodeGen/LiveRangeEdit.cpp
@@ -52,8 +52,11 @@
   return LI;
 }
 
-unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
-  unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+unsigned LiveRangeEdit::createFrom(unsigned OldReg,
+                                   const TargetRegisterClass *RC) {
+  if (RC == nullptr)
+    RC = MRI.getRegClass(OldReg);
+  unsigned VReg = MRI.createVirtualRegister(RC);
   if (VRM) {
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
Index: lib/CodeGen/RegAllocFast.cpp
===================================================================
--- lib/CodeGen/RegAllocFast.cpp
+++ lib/CodeGen/RegAllocFast.cpp
@@ -189,9 +189,8 @@
     void usePhysReg(MachineOperand &MO);
     void definePhysReg(MachineBasicBlock::iterator MI, MCPhysReg PhysReg,
-                       RegState NewState);
+                       RegState NewState, bool IsUsedInInstr = false);
     unsigned calcSpillCost(MCPhysReg PhysReg) const;
-    void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg);
 
     LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
       return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
@@ -201,7 +200,9 @@
       return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
     }
 
-    void allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint);
+    bool allocVirtReg(MachineInstr &MI, unsigned VirtReg, unsigned Hint,
+                      MCPhysReg *PhysReg, bool IsUsedInInstr);
+    void assignVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint);
     MCPhysReg defineVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
                             unsigned Hint);
     LiveReg &reloadVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
@@ -215,6 +216,11 @@
     void reload(MachineBasicBlock::iterator Before, unsigned VirtReg,
                 MCPhysReg PhysReg);
 
+    unsigned createVirtReg(const TargetRegisterClass &RC);
+    void handleIntermediarySpill(MachineBasicBlock::iterator BeginMII,
+                                 MachineBasicBlock::iterator EndMII,
+                                 unsigned VirtReg);
+
     void dumpState();
   };
 
@@ -259,9 +265,34 @@
   LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
 
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-  TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI);
+  const TargetRegisterClass &StoreRC =
+      *TII->getRegClassForStackSaveRestore(&RC);
+
+  MachineBasicBlock::iterator PrevMII =
+      Before == MBB->begin() ? MBB->end() : std::prev(Before);
+  unsigned StoreReg = AssignedReg;
+  bool NeedsIntermediary = &RC != &StoreRC && !StoreRC.contains(StoreReg);
+  if (NeedsIntermediary) {
+    assert(&StoreRC == TII->getRegClassForStackSaveRestore(&StoreRC) &&
+           "Invalid regclass cascade for stack save");
+    StoreReg = createVirtReg(StoreRC);
+    LLVM_DEBUG(dbgs() << "Using " << printReg(StoreReg, TRI) << ":"
+                      << TRI->getRegClassName(&StoreRC)
+                      << " as an intermediary for the spill\n");
+
+    BuildMI(*MBB, Before, Before->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            StoreReg)
+        .addReg(AssignedReg, llvm::RegState::Kill);
+  }
+
+  TII->storeRegToStackSlot(*MBB, Before, StoreReg, Kill, FI, &StoreRC, TRI);
   ++NumStores;
 
+  if (NeedsIntermediary)
+    handleIntermediarySpill(PrevMII == MBB->end() ? MBB->begin()
+                                                  : std::next(PrevMII),
+                            Before, StoreReg);
+
   // If this register is used by DBG_VALUE then insert new DBG_VALUE to
   // identify spilled location as the place to find corresponding variable's
   // value.
@@ -285,8 +316,32 @@
                     << printReg(PhysReg, TRI) << '\n');
   int FI = getStackSpaceFor(VirtReg);
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-  TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI);
+  const TargetRegisterClass &LoadRC = *TII->getRegClassForStackSaveRestore(&RC);
+
+  MachineBasicBlock::iterator PrevMII =
+      Before == MBB->begin() ? MBB->end() : std::prev(Before);
+  unsigned LoadReg = PhysReg;
+  bool NeedsIntermediary = &RC != &LoadRC && !LoadRC.contains(LoadReg);
+  if (NeedsIntermediary) {
+    assert(&LoadRC == TII->getRegClassForStackSaveRestore(&LoadRC) &&
+           "Invalid regclass cascade for stack restore");
+    LoadReg = createVirtReg(LoadRC);
+    LLVM_DEBUG(dbgs() << "Using " << printReg(LoadReg, TRI) << ":"
+                      << TRI->getRegClassName(&LoadRC)
+                      << " as an intermediary for the reload\n");
+  }
+
+  TII->loadRegFromStackSlot(*MBB, Before, LoadReg, FI, &LoadRC, TRI);
  ++NumLoads;
+
+  if (NeedsIntermediary) {
+    BuildMI(*MBB, Before, Before->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            PhysReg)
+        .addReg(LoadReg, llvm::RegState::Kill);
+    handleIntermediarySpill(PrevMII == MBB->end() ? MBB->begin()
                                                  : std::next(PrevMII),
+                            Before, LoadReg);
+  }
 }
 
 /// Return true if MO is the only remaining reference to its virtual register,
@@ -456,8 +511,10 @@
 /// similar to defineVirtReg except the physreg is reserved instead of
 /// allocated.
 void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
-                                 MCPhysReg PhysReg, RegState NewState) {
-  markRegUsedInInstr(PhysReg);
+                                 MCPhysReg PhysReg, RegState NewState,
+                                 bool IsUsedInInstr) {
+  if (IsUsedInInstr)
+    markRegUsedInInstr(PhysReg);
   switch (unsigned VirtReg = PhysRegState[PhysReg]) {
   case regDisabled:
     break;
@@ -542,23 +599,10 @@
   return Cost;
 }
 
-/// This method updates local state so that we know that PhysReg is the
-/// proper container for VirtReg now. The physical register must not be used
-/// for anything else when this is called.
-void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) { - unsigned VirtReg = LR.VirtReg; - LLVM_DEBUG(dbgs() << "Assigning " << printReg(VirtReg, TRI) << " to " - << printReg(PhysReg, TRI) << '\n'); - assert(LR.PhysReg == 0 && "Already assigned a physreg"); - assert(PhysReg != 0 && "Trying to assign no register"); - LR.PhysReg = PhysReg; - setPhysRegState(PhysReg, VirtReg); -} - /// Allocates a physical register for VirtReg. -void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) { - const unsigned VirtReg = LR.VirtReg; - +bool RegAllocFast::allocVirtReg(MachineInstr &MI, unsigned VirtReg, + unsigned Hint, MCPhysReg *OutPhysReg, + bool IsUsedInInstr) { assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Can only allocate virtual registers"); @@ -573,9 +617,9 @@ unsigned Cost = calcSpillCost(Hint); if (Cost < spillDirty) { if (Cost) - definePhysReg(MI, Hint, regFree); - assignVirtToPhysReg(LR, Hint); - return; + definePhysReg(MI, Hint, regFree, IsUsedInInstr); + *OutPhysReg = Hint; + return true; } } @@ -583,8 +627,8 @@ ArrayRef AllocationOrder = RegClassInfo.getOrder(&RC); for (MCPhysReg PhysReg : AllocationOrder) { if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) { - assignVirtToPhysReg(LR, PhysReg); - return; + *OutPhysReg = PhysReg; + return true; } } @@ -599,8 +643,8 @@ LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n'); // Cost is 0 when all aliases are already disabled. if (Cost == 0) { - assignVirtToPhysReg(LR, PhysReg); - return; + *OutPhysReg = PhysReg; + return true; } if (Cost < BestCost) { BestReg = PhysReg; @@ -608,19 +652,37 @@ } } - if (!BestReg) { + if (BestReg) { + definePhysReg(MI, BestReg, regFree, IsUsedInInstr); + *OutPhysReg = BestReg; + return true; + } + + *OutPhysReg = *AllocationOrder.begin(); + return false; +} + +void RegAllocFast::assignVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) { + assert(LR.PhysReg == 0 && "Already assigned a physreg"); + + const unsigned VirtReg = LR.VirtReg; + MCPhysReg PhysReg; + bool Defined = allocVirtReg(MI, VirtReg, Hint, &PhysReg, true); + if (!Defined) { // Nothing we can do. Report an error and keep going with a bad allocation. if (MI.isInlineAsm()) MI.emitError("inline assembly requires more registers than available"); else MI.emitError("ran out of registers during register allocation"); - definePhysReg(MI, *AllocationOrder.begin(), regFree); - assignVirtToPhysReg(LR, *AllocationOrder.begin()); - return; + definePhysReg(MI, PhysReg, regFree); } - definePhysReg(MI, BestReg, regFree); - assignVirtToPhysReg(LR, BestReg); + // Update local state so that we know that PhysReg is the proper container for + // VirtReg now. + LLVM_DEBUG(dbgs() << "Assigning " << printReg(VirtReg, TRI) << " to " + << printReg(PhysReg, TRI) << '\n'); + LR.PhysReg = PhysReg; + setPhysRegState(PhysReg, VirtReg); } /// Allocates a register for VirtReg and mark it as dirty. @@ -640,7 +702,7 @@ if (UseMI.isCopyLike()) Hint = UseMI.getOperand(0).getReg(); } - allocVirtReg(MI, *LRI, Hint); + assignVirtReg(MI, *LRI, Hint); } else if (LRI->LastUse) { // Redefining a live register - kill at the last use, unless it is this // instruction defining VirtReg multiple times. 
@@ -667,7 +729,7 @@ std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg)); MachineOperand &MO = MI.getOperand(OpNum); if (!LRI->PhysReg) { - allocVirtReg(MI, *LRI, Hint); + assignVirtReg(MI, *LRI, Hint); reload(MI, VirtReg, LRI->PhysReg); } else if (LRI->Dirty) { if (isLastUseOfLocalReg(MO)) { @@ -734,6 +796,69 @@ return Dead; } +/// Create a new virtual register for use by the allocator. +unsigned RegAllocFast::createVirtReg(const TargetRegisterClass &RC) { + unsigned Reg = MRI->createVirtualRegister(&RC); + unsigned NumVirtRegs = MRI->getNumVirtRegs(); + StackSlotForVirtReg.resize(NumVirtRegs); + LiveVirtRegs.setUniverse(NumVirtRegs); + return Reg; +} + +/// Process a spill/reload sequence that uses an intermediary register. The +/// method expects an instruction range implementing the spill/reload and id of +/// the new intermediary register. The intermediary is allocated to a physical +/// register and the instruction sequence is appropriately updated. +void RegAllocFast::handleIntermediarySpill(MachineBasicBlock::iterator BeginMII, + MachineBasicBlock::iterator EndMII, + unsigned VirtReg) { + assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && + "Not a virtual register"); + + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + LLVM_DEBUG(dbgs() << "Allocating intermediary register " + << printReg(VirtReg, TRI) << ":" + << TRI->getRegClassName(&RC) + << " to a physical register\n"); + + // Allocate the intermediary virtual register to a physical register. + MCPhysReg InterPhysReg; + bool Defined = allocVirtReg(*BeginMII, VirtReg, 0, &InterPhysReg, false); + if (!Defined) { + // If an instruction uses a large number of registers (for instance, it is a + // complex INLINEASM), it is possible that all registers that can store the + // intermediary are already in use. In that case, one of these registers is + // temporarily spilled so the intermediary can be allocated. + // + // Note: The target must guarantee that an intermediary register can be + // successfully stored/loaded without modifying content of any of its super + // registers. + int FI = getStackSpaceFor(VirtReg); + + LLVM_DEBUG(dbgs() << "Temporarily spilling " << printReg(InterPhysReg, TRI) + << " to stack slot #" << FI + << " to allocate intermediary register " + << printReg(VirtReg, TRI) << ":" + << TRI->getRegClassName(&RC) << "\n"); + + // TODO Fix debug information for the spill (DBG_VALUE). + TII->storeRegToStackSlot(*MBB, BeginMII, InterPhysReg, true, FI, &RC, TRI); + ++NumStores; + + TII->loadRegFromStackSlot(*MBB, EndMII, InterPhysReg, FI, &RC, TRI); + ++NumLoads; + } + + // Update the intermediary register in the spill sequence. + for (MachineInstr &MI : make_range(BeginMII, EndMII)) { + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (MO.isReg() && MO.getReg() == VirtReg) + setPhysReg(MI, I, InterPhysReg); + } + } +} + // Handles special instruction operand like early clobbers and tied ops when // there are additional physreg defines. void RegAllocFast::handleThroughOperands(MachineInstr &MI, @@ -1016,6 +1141,19 @@ } } + unsigned DefOpEnd = MI.getNumOperands(); + if (MI.isCall()) { + // Spill all virtregs before a call. This serves one purpose: If an + // exception is thrown, the landing pad is going to expect to find + // registers in their spill slots. 
+ // Note: although this is appealing to just consider all definitions + // as call-clobbered, this is not correct because some of those + // definitions may be used later on and we do not want to reuse + // those for virtual registers in between. + LLVM_DEBUG(dbgs() << " Spilling remaining registers before call.\n"); + spillAll(MI); + } + // Track registers defined by instruction - early clobbers and tied uses at // this point. UsedInInstr.clear(); @@ -1030,19 +1168,6 @@ } } - unsigned DefOpEnd = MI.getNumOperands(); - if (MI.isCall()) { - // Spill all virtregs before a call. This serves one purpose: If an - // exception is thrown, the landing pad is going to expect to find - // registers in their spill slots. - // Note: although this is appealing to just consider all definitions - // as call-clobbered, this is not correct because some of those - // definitions may be used later on and we do not want to reuse - // those for virtual registers in between. - LLVM_DEBUG(dbgs() << " Spilling remaining registers before call.\n"); - spillAll(MI); - } - // Third scan. // Allocate defs and collect dead defs. for (unsigned I = 0; I != DefOpEnd; ++I) { Index: lib/Target/ARM/Thumb1InstrInfo.h =================================================================== --- lib/Target/ARM/Thumb1InstrInfo.h +++ lib/Target/ARM/Thumb1InstrInfo.h @@ -53,6 +53,9 @@ const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + const TargetRegisterClass * + getRegClassForStackSaveRestore(const TargetRegisterClass *RC) const override; + bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; Index: lib/Target/ARM/Thumb1InstrInfo.cpp =================================================================== --- lib/Target/ARM/Thumb1InstrInfo.cpp +++ lib/Target/ARM/Thumb1InstrInfo.cpp @@ -132,6 +132,13 @@ } } +const TargetRegisterClass *Thumb1InstrInfo::getRegClassForStackSaveRestore( + const TargetRegisterClass *RC) const { + if (ARM::hGPRRegClass.hasSubClassEq(RC)) + return &ARM::tGPRRegClass; + return RC; +} + void Thumb1InstrInfo::expandLoadStackGuard( MachineBasicBlock::iterator MI) const { MachineFunction &MF = *MI->getParent()->getParent(); Index: test/CodeGen/Thumb/hgpr-spill-basic.mir =================================================================== --- /dev/null +++ test/CodeGen/Thumb/hgpr-spill-basic.mir @@ -0,0 +1,74 @@ +# RUN: llc -run-pass regallocbasic %s -o - | FileCheck %s --check-prefix=CHECK-ALLOC +# RUN: llc -run-pass regallocbasic,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK-REWRITE + +# This test examines register allocation and spilling of high register in Thumb1 +# with Basic Register Allocator. The test uses two consecutive inline assembler +# expressions that both request an input variable to be loaded in a high +# register. The first expression marks {r8, r9, r10, r11} as clobbered, the +# second one marks {r12, lr} as such. The allocator cannot choose the same +# register to load the variable and a spill occurs. +# +# The test checks that InlineSpiller used by Basic Register Allocator implements +# the following: +# * A high register in Thumb1 is spilled by inserting a copy to a low register +# and then saving that. +# * A high register in Thumb1 is restored by inserting a load to a low register +# and then a copy to the high register. 
+ +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv6m-none--eabi" + + define dso_local void @constraint_h() { + entry: + %i = alloca i32, align 4 + %0 = load i32, i32* %i, align 4 + call void asm sideeffect "@ $0", "h,~{r8},~{r9},~{r10},~{r11}"(i32 %0) + call void asm sideeffect "@ $0", "h,~{r12},~{lr}"(i32 %0) + ret void + } + +... +--- +name: constraint_h +tracksRegLiveness: true +registers: + - { id: 0, class: hgpr } + - { id: 1, class: tgpr } +stack: + - { id: 0, name: i, size: 4, alignment: 4, stack-id: 0, local-offset: -4 } +body: | + bb.0.entry: + %1:tgpr = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) + %0:hgpr = COPY %1 + INLINEASM &"@ $0", 1, 589833, %0, 12, implicit-def early-clobber $r8, implicit-def early-clobber $r9, implicit-def early-clobber $r10, implicit-def early-clobber $r11 + INLINEASM &"@ $0", 1, 589833, %0, 12, implicit-def early-clobber $r12, implicit-def early-clobber $lr + tBX_RET 14, $noreg + +... + +# CHECK-ALLOC: bb.0.entry: +# CHECK-ALLOC-NEXT: %1:tgpr = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) +# CHECK-ALLOC-NEXT: %2:gpr = COPY %1 +# CHECK-ALLOC-NEXT: %3:tgpr = COPY %2 +# CHECK-ALLOC-NEXT: tSTRspi %3, %stack.1, 0, 14, $noreg :: (store 4 into %stack.1) +# CHECK-ALLOC-NEXT: %5:tgpr = tLDRspi %stack.1, 0, 14, $noreg :: (load 4 from %stack.1) +# CHECK-ALLOC-NEXT: %4:hgpr = COPY %5 +# CHECK-ALLOC-NEXT: INLINEASM &"@ $0", 1, 589833, %4, 12, implicit-def early-clobber $r8, implicit-def early-clobber $r9, implicit-def early-clobber $r10, implicit-def early-clobber $r11 +# CHECK-ALLOC-NEXT: %7:tgpr = tLDRspi %stack.1, 0, 14, $noreg :: (load 4 from %stack.1) +# CHECK-ALLOC-NEXT: %6:hgpr = COPY %7 +# CHECK-ALLOC-NEXT: INLINEASM &"@ $0", 1, 589833, %6, 12, implicit-def early-clobber $r12, implicit-def early-clobber $lr +# CHECK-ALLOC-NEXT: tBX_RET 14, $noreg + +# CHECK-REWRITE: bb.0.entry: +# CHECK-REWRITE-NEXT: renamable $r0 = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) +# CHECK-REWRITE-NEXT: tSTRspi killed renamable $r0, %stack.1, 0, 14, $noreg :: (store 4 into %stack.1) +# CHECK-REWRITE-NEXT: renamable $r0 = tLDRspi %stack.1, 0, 14, $noreg :: (load 4 from %stack.1) +# CHECK-REWRITE-NEXT: renamable $r12 = COPY killed renamable $r0 +# CHECK-REWRITE-NEXT: INLINEASM &"@ $0", 1, 589833, killed renamable $r12, 12, implicit-def early-clobber $r8, implicit-def early-clobber $r9, implicit-def early-clobber $r10, implicit-def early-clobber $r11 +# CHECK-REWRITE-NEXT: renamable $r0 = tLDRspi %stack.1, 0, 14, $noreg :: (load 4 from %stack.1) +# CHECK-REWRITE-NEXT: renamable $r8 = COPY killed renamable $r0 +# CHECK-REWRITE-NEXT: INLINEASM &"@ $0", 1, 589833, killed renamable $r8, 12, implicit-def early-clobber $r12, implicit-def early-clobber $lr +# CHECK-REWRITE-NEXT: tBX_RET 14, $noreg Index: test/CodeGen/Thumb/hgpr-spill-fast-all.mir =================================================================== --- /dev/null +++ test/CodeGen/Thumb/hgpr-spill-fast-all.mir @@ -0,0 +1,155 @@ +# RUN: llc -run-pass regallocfast %s -o - | FileCheck %s + +# Check that Fast Register Allocator can succesfully spill all virtual registers +# before a call instruction, including any high registers. +# +# The test operates as follows: +# * Load a value in a high register which gets allocated to r12. +# * Load values in all low registers r0-r7. +# * Perform a call. 
The allocator spills all virtual registers prior calls and +# so it must be able to successfully store the values loaded in r12, r0-r7 to +# the stack. + +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv6m-none--eabi" + + define dso_local i32 @constraint_h() { + entry: + %ih = alloca i32, align 4 + %i0 = alloca i32, align 4 + %i1 = alloca i32, align 4 + %i2 = alloca i32, align 4 + %i3 = alloca i32, align 4 + %i4 = alloca i32, align 4 + %i5 = alloca i32, align 4 + %i6 = alloca i32, align 4 + %i7 = alloca i32, align 4 + %0 = load i32, i32* %ih, align 4 + %1 = load i32, i32* %i0, align 4 + %2 = load i32, i32* %i1, align 4 + %3 = load i32, i32* %i2, align 4 + %4 = load i32, i32* %i3, align 4 + %5 = load i32, i32* %i4, align 4 + %6 = load i32, i32* %i5, align 4 + %7 = load i32, i32* %i6, align 4 + %8 = load i32, i32* %i7, align 4 + call void @bar() + %add = add nsw i32 %0, %1 + %add1 = add nsw i32 %add, %2 + %add2 = add nsw i32 %add1, %3 + %add3 = add nsw i32 %add2, %4 + %add4 = add nsw i32 %add3, %5 + %add5 = add nsw i32 %add4, %6 + %add6 = add nsw i32 %add5, %7 + %add7 = add nsw i32 %add6, %8 + ret i32 %add7 + } + + declare void @bar() + +... +--- +name: constraint_h +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr } + - { id: 1, class: hgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } + - { id: 4, class: tgpr } + - { id: 5, class: tgpr } + - { id: 6, class: tgpr } + - { id: 7, class: tgpr } + - { id: 8, class: tgpr } + - { id: 9, class: tgpr } + - { id: 10, class: tgpr } + - { id: 11, class: tgpr } + - { id: 12, class: tgpr } + - { id: 13, class: tgpr } + - { id: 14, class: tgpr } + - { id: 15, class: tgpr } + - { id: 16, class: tgpr } + - { id: 17, class: tgpr } + - { id: 18, class: tgpr } +stack: + - { id: 0, name: ih, size: 4, alignment: 4, stack-id: 0, local-offset: -4 } + - { id: 1, name: i0, size: 4, alignment: 4, stack-id: 0, local-offset: -8 } + - { id: 2, name: i1, size: 4, alignment: 4, stack-id: 0, local-offset: -12 } + - { id: 3, name: i2, size: 4, alignment: 4, stack-id: 0, local-offset: -16 } + - { id: 4, name: i3, size: 4, alignment: 4, stack-id: 0, local-offset: -20 } + - { id: 5, name: i4, size: 4, alignment: 4, stack-id: 0, local-offset: -24 } + - { id: 6, name: i5, size: 4, alignment: 4, stack-id: 0, local-offset: -28 } + - { id: 7, name: i6, size: 4, alignment: 4, stack-id: 0, local-offset: -32 } + - { id: 8, name: i7, size: 4, alignment: 4, stack-id: 0, local-offset: -36 } +body: | + bb.0.entry: + %0:tgpr = tLDRspi %stack.0.ih, 0, 14, $noreg :: (dereferenceable load 4 from %ir.ih) + %1:hgpr = COPY %0 + %2:tgpr = tLDRspi %stack.1.i0, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i0) + %3:tgpr = tLDRspi %stack.2.i1, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i1) + %4:tgpr = tLDRspi %stack.3.i2, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i2) + %5:tgpr = tLDRspi %stack.4.i3, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i3) + %6:tgpr = tLDRspi %stack.5.i4, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i4) + %7:tgpr = tLDRspi %stack.6.i5, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i5) + %8:tgpr = tLDRspi %stack.7.i6, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i6) + %9:tgpr = tLDRspi %stack.8.i7, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i7) + tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + %10:tgpr = COPY %1 + %11:tgpr, $cpsr = nsw tADDrr %10, %2, 14, $noreg 
+ %12:tgpr, $cpsr = nsw tADDrr %11, %3, 14, $noreg + %13:tgpr, $cpsr = nsw tADDrr %12, %4, 14, $noreg + %14:tgpr, $cpsr = nsw tADDrr %13, %5, 14, $noreg + %15:tgpr, $cpsr = nsw tADDrr %14, %6, 14, $noreg + %16:tgpr, $cpsr = nsw tADDrr %15, %7, 14, $noreg + %17:tgpr, $cpsr = nsw tADDrr %16, %8, 14, $noreg + %18:tgpr, $cpsr = nsw tADDrr %17, %9, 14, $noreg + $r0 = COPY %18 + tBX_RET 14, $noreg, implicit $r0 + +... + +# CHECK: bb.0.entry: +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.0.ih, 0, 14, $noreg :: (dereferenceable load 4 from %ir.ih) +# CHECK-NEXT: renamable $r12 = COPY killed renamable $r0 +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.1.i0, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i0) +# CHECK-NEXT: renamable $r1 = tLDRspi %stack.2.i1, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i1) +# CHECK-NEXT: renamable $r2 = tLDRspi %stack.3.i2, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i2) +# CHECK-NEXT: renamable $r3 = tLDRspi %stack.4.i3, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i3) +# CHECK-NEXT: renamable $r4 = tLDRspi %stack.5.i4, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i4) +# CHECK-NEXT: renamable $r5 = tLDRspi %stack.6.i5, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i5) +# CHECK-NEXT: renamable $r6 = tLDRspi %stack.7.i6, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i6) +# CHECK-NEXT: renamable $r7 = tLDRspi %stack.8.i7, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i7) +# CHECK-NEXT: tSTRspi killed $r0, %stack.10, 0, 14, $noreg :: (store 4 into %stack.10) +# CHECK-NEXT: renamable $r0 = COPY killed $r12 +# CHECK-NEXT: tSTRspi killed renamable $r0, %stack.9, 0, 14, $noreg :: (store 4 into %stack.9) +# CHECK-NEXT: tSTRspi killed $r1, %stack.11, 0, 14, $noreg :: (store 4 into %stack.11) +# CHECK-NEXT: tSTRspi killed $r2, %stack.12, 0, 14, $noreg :: (store 4 into %stack.12) +# CHECK-NEXT: tSTRspi killed $r3, %stack.13, 0, 14, $noreg :: (store 4 into %stack.13) +# CHECK-NEXT: tSTRspi killed $r4, %stack.14, 0, 14, $noreg :: (store 4 into %stack.14) +# CHECK-NEXT: tSTRspi killed $r5, %stack.15, 0, 14, $noreg :: (store 4 into %stack.15) +# CHECK-NEXT: tSTRspi killed $r6, %stack.16, 0, 14, $noreg :: (store 4 into %stack.16) +# CHECK-NEXT: tSTRspi killed $r7, %stack.17, 0, 14, $noreg :: (store 4 into %stack.17) +# CHECK-NEXT: tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.9, 0, 14, $noreg :: (load 4 from %stack.9) +# CHECK-NEXT: $r12 = COPY killed renamable $r0 +# CHECK-NEXT: renamable $r0 = COPY killed renamable $r12 +# CHECK-NEXT: $r1 = tLDRspi %stack.10, 0, 14, $noreg :: (load 4 from %stack.10) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r1, 14, $noreg +# CHECK-NEXT: $r2 = tLDRspi %stack.11, 0, 14, $noreg :: (load 4 from %stack.11) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r2, 14, $noreg +# CHECK-NEXT: $r3 = tLDRspi %stack.12, 0, 14, $noreg :: (load 4 from %stack.12) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r3, 14, $noreg +# CHECK-NEXT: $r4 = tLDRspi %stack.13, 0, 14, $noreg :: (load 4 from %stack.13) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r4, 14, $noreg +# CHECK-NEXT: $r5 = tLDRspi %stack.14, 0, 14, $noreg :: (load 4 from %stack.14) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r5, 14, $noreg +# CHECK-NEXT: $r6 = tLDRspi 
%stack.15, 0, 14, $noreg :: (load 4 from %stack.15) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r6, 14, $noreg +# CHECK-NEXT: $r7 = tLDRspi %stack.16, 0, 14, $noreg :: (load 4 from %stack.16) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r7, 14, $noreg +# CHECK-NEXT: $r1 = tLDRspi %stack.17, 0, 14, $noreg :: (load 4 from %stack.17) +# CHECK-NEXT: renamable $r0, $cpsr = nsw tADDrr killed renamable $r0, killed renamable $r1, 14, $noreg +# CHECK-NEXT: tBX_RET 14, $noreg, implicit killed $r0 Index: test/CodeGen/Thumb/hgpr-spill-fast-tsave.mir =================================================================== --- /dev/null +++ test/CodeGen/Thumb/hgpr-spill-fast-tsave.mir @@ -0,0 +1,116 @@ +# RUN: llc -run-pass regallocfast %s -o - | FileCheck %s + +# Check that when storing a high register to a stack slot using an intermediary, +# Fast Register Allocator is also able to spill a value in a register that it +# needs to allocate for the intermediary. +# +# The test operates as follows: +# * Physically define registers r0-r6 to make them reserved. +# * Load a value in a high register which gets allocated to r12. +# * Load a value in a low register which gets allocated to the remaining +# register r7. +# * Use INLINEASM that has r0-r6 and the value currently in r7 as inputs but +# marks r12 as clobbered. The allocator must store the current value in r12 to +# the stack. This requires the value in r7 to be also spilled and then +# reloaded. + +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv6m-none--eabi" + + define dso_local i32 @constraint_h() { + entry: + %i0 = alloca i32, align 4 + %i1 = alloca i32, align 4 + %i2 = alloca i32, align 4 + %i3 = alloca i32, align 4 + %i4 = alloca i32, align 4 + %i5 = alloca i32, align 4 + %i6 = alloca i32, align 4 + %ih = alloca i32, align 4 + %i7 = alloca i32, align 4 + %0 = load i32, i32* %i0, align 4 + %1 = load i32, i32* %i1, align 4 + %2 = load i32, i32* %i2, align 4 + %3 = load i32, i32* %i3, align 4 + %4 = load i32, i32* %i4, align 4 + %5 = load i32, i32* %i5, align 4 + %6 = load i32, i32* %i6, align 4 + %7 = load i32, i32* %ih, align 4 + %8 = load i32, i32* %i7, align 4 + call void asm sideeffect "@ $0 $1 $2 $3 $4 $5 $6 $7", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},r,~{r12}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %8) + ret i32 %8 + } + +... 
+--- +name: constraint_h +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr } + - { id: 1, class: tgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } + - { id: 4, class: tgpr } + - { id: 5, class: tgpr } + - { id: 6, class: tgpr } + - { id: 7, class: tgpr } + - { id: 8, class: hgpr } + - { id: 9, class: tgpr } +stack: + - { id: 0, name: i0, size: 4, alignment: 4, stack-id: 0, local-offset: -4 } + - { id: 1, name: i1, size: 4, alignment: 4, stack-id: 0, local-offset: -8 } + - { id: 2, name: i2, size: 4, alignment: 4, stack-id: 0, local-offset: -12 } + - { id: 3, name: i3, size: 4, alignment: 4, stack-id: 0, local-offset: -16 } + - { id: 4, name: i4, size: 4, alignment: 4, stack-id: 0, local-offset: -20 } + - { id: 5, name: i5, size: 4, alignment: 4, stack-id: 0, local-offset: -24 } + - { id: 6, name: i6, size: 4, alignment: 4, stack-id: 0, local-offset: -28 } + - { id: 7, name: ih, size: 4, alignment: 4, stack-id: 0, local-offset: -32 } + - { id: 8, name: i7, size: 4, alignment: 4, stack-id: 0, local-offset: -36 } +body: | + bb.0.entry: + %0:tgpr = tLDRspi %stack.0.i0, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i0) + %1:tgpr = tLDRspi %stack.1.i1, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i1) + %2:tgpr = tLDRspi %stack.2.i2, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i2) + %3:tgpr = tLDRspi %stack.3.i3, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i3) + %4:tgpr = tLDRspi %stack.4.i4, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i4) + %5:tgpr = tLDRspi %stack.5.i5, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i5) + %6:tgpr = tLDRspi %stack.6.i6, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i6) + $r0 = COPY %0 + $r1 = COPY %1 + $r2 = COPY %2 + $r3 = COPY %3 + $r4 = COPY %4 + $r5 = COPY %5 + $r6 = COPY %6 + %7:tgpr = tLDRspi %stack.7.ih, 0, 14, $noreg :: (dereferenceable load 4 from %ir.ih) + %8:hgpr = COPY %7 + %9:tgpr = tLDRspi %stack.8.i7, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i7) + INLINEASM &"@ $0 $1 $2 $3 $4 $5 $6 $7", 1, 9, $r0, 9, $r1, 9, $r2, 9, $r3, 9, $r4, 9, $r5, 9, $r6, 655369, %9, 12, implicit-def early-clobber $r12 + $r0 = COPY %8 + tBX_RET 14, $noreg, implicit $r0 + +... 
+ +# CHECK: bb.0.entry: +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.0.i0, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i0) +# CHECK-NEXT: renamable $r1 = tLDRspi %stack.1.i1, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i1) +# CHECK-NEXT: renamable $r2 = tLDRspi %stack.2.i2, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i2) +# CHECK-NEXT: renamable $r3 = tLDRspi %stack.3.i3, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i3) +# CHECK-NEXT: renamable $r4 = tLDRspi %stack.4.i4, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i4) +# CHECK-NEXT: renamable $r5 = tLDRspi %stack.5.i5, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i5) +# CHECK-NEXT: renamable $r6 = tLDRspi %stack.6.i6, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i6) +# CHECK-NEXT: renamable $r7 = tLDRspi %stack.7.ih, 0, 14, $noreg :: (dereferenceable load 4 from %ir.ih) +# CHECK-NEXT: renamable $r12 = COPY killed renamable $r7 +# CHECK-NEXT: renamable $r7 = tLDRspi %stack.8.i7, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i7) +# CHECK-NEXT: tSTRspi killed $r7, %stack.10, 0, 14, $noreg :: (store 4 into %stack.10) +# CHECK-NEXT: renamable $r7 = COPY killed $r12 +# CHECK-NEXT: tSTRspi killed renamable $r7, %stack.9, 0, 14, $noreg :: (store 4 into %stack.9) +# CHECK-NEXT: $r7 = tLDRspi %stack.10, 0, 14, $noreg :: (load 4 from %stack.10) +# CHECK-NEXT: INLINEASM &"@ $0 $1 $2 $3 $4 $5 $6 $7", 1, 9, killed $r0, 9, killed $r1, 9, killed $r2, 9, killed $r3, 9, killed $r4, 9, killed $r5, 9, killed $r6, 655369, killed renamable $r7, 12, implicit-def early-clobber $r12 +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.9, 0, 14, $noreg :: (load 4 from %stack.9) +# CHECK-NEXT: $r12 = COPY killed renamable $r0 +# CHECK-NEXT: $r0 = COPY killed renamable $r12 +# CHECK-NEXT: tBX_RET 14, $noreg, implicit killed $r0 Index: test/CodeGen/Thumb/hgpr-spill-fast-tsave2.mir =================================================================== --- /dev/null +++ test/CodeGen/Thumb/hgpr-spill-fast-tsave2.mir @@ -0,0 +1,116 @@ +# RUN: llc -run-pass regallocfast %s -o - | FileCheck %s + +# Check that when storing a high register to a stack slot using an intermediary, +# Fast Register Allocator is able to insert a temporary spill of a register that +# it needs for the intermediary if no such register can be normally allocated. +# +# The test operates as follows: +# * Physically define registers r0-r6 to make them reserved. +# * Load a value in a high register which gets allocated to r12. +# * Physically define the remaining low register r7 to make it reserved. +# * Use INLINEASM that has r0-r7 as inputs but marks r12 as clobbered. The +# allocator must store the current value in r12 to the stack. This requires a +# temporary spill of one of the low registers that are already used by +# INLINEASM. 
+ +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv6m-none--eabi" + + define dso_local i32 @constraint_h() { + entry: + %i0 = alloca i32, align 4 + %i1 = alloca i32, align 4 + %i2 = alloca i32, align 4 + %i3 = alloca i32, align 4 + %i4 = alloca i32, align 4 + %i5 = alloca i32, align 4 + %i6 = alloca i32, align 4 + %ih = alloca i32, align 4 + %i7 = alloca i32, align 4 + %0 = load i32, i32* %i0, align 4 + %1 = load i32, i32* %i1, align 4 + %2 = load i32, i32* %i2, align 4 + %3 = load i32, i32* %i3, align 4 + %4 = load i32, i32* %i4, align 4 + %5 = load i32, i32* %i5, align 4 + %6 = load i32, i32* %i6, align 4 + %7 = load i32, i32* %ih, align 4 + %8 = load i32, i32* %i7, align 4 + call void asm sideeffect "@ $0 $1 $2 $3 $4 $5 $6 $7", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7},~{r12}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %8) + ret i32 %8 + } + +... +--- +name: constraint_h +tracksRegLiveness: true +registers: + - { id: 0, class: tgpr } + - { id: 1, class: tgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } + - { id: 4, class: tgpr } + - { id: 5, class: tgpr } + - { id: 6, class: tgpr } + - { id: 7, class: tgpr } + - { id: 8, class: hgpr } + - { id: 9, class: tgpr } +stack: + - { id: 0, name: i0, size: 4, alignment: 4, stack-id: 0, local-offset: -4 } + - { id: 1, name: i1, size: 4, alignment: 4, stack-id: 0, local-offset: -8 } + - { id: 2, name: i2, size: 4, alignment: 4, stack-id: 0, local-offset: -12 } + - { id: 3, name: i3, size: 4, alignment: 4, stack-id: 0, local-offset: -16 } + - { id: 4, name: i4, size: 4, alignment: 4, stack-id: 0, local-offset: -20 } + - { id: 5, name: i5, size: 4, alignment: 4, stack-id: 0, local-offset: -24 } + - { id: 6, name: i6, size: 4, alignment: 4, stack-id: 0, local-offset: -28 } + - { id: 7, name: ih, size: 4, alignment: 4, stack-id: 0, local-offset: -32 } + - { id: 8, name: i7, size: 4, alignment: 4, stack-id: 0, local-offset: -36 } +body: | + bb.0.entry: + %0:tgpr = tLDRspi %stack.0.i0, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i0) + %1:tgpr = tLDRspi %stack.1.i1, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i1) + %2:tgpr = tLDRspi %stack.2.i2, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i2) + %3:tgpr = tLDRspi %stack.3.i3, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i3) + %4:tgpr = tLDRspi %stack.4.i4, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i4) + %5:tgpr = tLDRspi %stack.5.i5, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i5) + %6:tgpr = tLDRspi %stack.6.i6, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i6) + $r0 = COPY %0 + $r1 = COPY %1 + $r2 = COPY %2 + $r3 = COPY %3 + $r4 = COPY %4 + $r5 = COPY %5 + $r6 = COPY %6 + %7:tgpr = tLDRspi %stack.7.ih, 0, 14, $noreg :: (dereferenceable load 4 from %ir.ih) + %8:hgpr = COPY %7 + %9:tgpr = tLDRspi %stack.8.i7, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i7) + $r7 = COPY %9 + INLINEASM &"@ $0 $1 $2 $3 $4 $5 $6 $7", 1, 9, $r0, 9, $r1, 9, $r2, 9, $r3, 9, $r4, 9, $r5, 9, $r6, 9, $r7, 12, implicit-def early-clobber $r12 + $r0 = COPY %8 + tBX_RET 14, $noreg, implicit $r0 + +... 
+ +# CHECK: bb.0.entry: +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.0.i0, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i0) +# CHECK-NEXT: renamable $r1 = tLDRspi %stack.1.i1, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i1) +# CHECK-NEXT: renamable $r2 = tLDRspi %stack.2.i2, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i2) +# CHECK-NEXT: renamable $r3 = tLDRspi %stack.3.i3, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i3) +# CHECK-NEXT: renamable $r4 = tLDRspi %stack.4.i4, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i4) +# CHECK-NEXT: renamable $r5 = tLDRspi %stack.5.i5, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i5) +# CHECK-NEXT: renamable $r6 = tLDRspi %stack.6.i6, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i6) +# CHECK-NEXT: renamable $r7 = tLDRspi %stack.7.ih, 0, 14, $noreg :: (dereferenceable load 4 from %ir.ih) +# CHECK-NEXT: renamable $r12 = COPY killed renamable $r7 +# CHECK-NEXT: renamable $r7 = tLDRspi %stack.8.i7, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i7) +# CHECK-NEXT: tSTRspi killed $r0, %stack.10, 0, 14, $noreg :: (store 4 into %stack.10) +# CHECK-NEXT: renamable $r0 = COPY killed $r12 +# CHECK-NEXT: tSTRspi killed renamable $r0, %stack.9, 0, 14, $noreg :: (store 4 into %stack.9) +# CHECK-NEXT: $r0 = tLDRspi %stack.10, 0, 14, $noreg :: (load 4 from %stack.10) +# CHECK-NEXT: INLINEASM &"@ $0 $1 $2 $3 $4 $5 $6 $7", 1, 9, killed $r0, 9, killed $r1, 9, killed $r2, 9, killed $r3, 9, killed $r4, 9, killed $r5, 9, killed $r6, 9, killed $r7, 12, implicit-def early-clobber $r12 +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.9, 0, 14, $noreg :: (load 4 from %stack.9) +# CHECK-NEXT: $r12 = COPY killed renamable $r0 +# CHECK-NEXT: $r0 = COPY killed renamable $r12 +# CHECK-NEXT: tBX_RET 14, $noreg, implicit killed $r0 Index: test/CodeGen/Thumb/hgpr-spill-fast.mir =================================================================== --- /dev/null +++ test/CodeGen/Thumb/hgpr-spill-fast.mir @@ -0,0 +1,56 @@ +# RUN: llc -run-pass regallocfast %s -o - | FileCheck %s + +# This test examines register allocation and spilling of high registers in +# Thumb1 with Fast Register Allocator. The test uses inline assembler that +# requests an input variable to be loaded in a high register but at the same +# time has r12 marked as clobbered. The allocator initially satisfies the load +# request by selecting r12 but then needs to spill this register when it reaches +# the INLINEASM instruction and notices its clobber definition. +# +# The test checks that Fast Register Allocator implements the following: +# * A high register in Thumb1 is spilled by inserting a copy to a low register +# and then saving that. +# * A high register in Thumb1 is restored by inserting a load to a low register +# and then a copy to the high register. + +--- | + ; ModuleID = 'test.ll' + source_filename = "test.c" + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv6m-none--eabi" + + define dso_local void @constraint_h() { + entry: + %i = alloca i32, align 4 + %0 = load i32, i32* %i, align 4 + call void asm sideeffect "@ $0", "h,~{r12}"(i32 %0) + ret void + } + +... 
+--- +name: constraint_h +tracksRegLiveness: true +registers: + - { id: 0, class: hgpr } + - { id: 1, class: tgpr } +stack: + - { id: 0, name: i, size: 4, alignment: 4, stack-id: 0, local-offset: -4 } +body: | + bb.0.entry: + %1:tgpr = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) + %0:hgpr = COPY %1 + INLINEASM &"@ $0", 1, 589833, %0, 12, implicit-def early-clobber $r12 + tBX_RET 14, $noreg + +... + +# CHECK: bb.0.entry: +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) +# CHECK-NEXT: renamable $r12 = COPY killed renamable $r0 +# CHECK-NEXT: renamable $r0 = COPY killed $r12 +# CHECK-NEXT: tSTRspi killed renamable $r0, %stack.1, 0, 14, $noreg :: (store 4 into %stack.1) +# CHECK-NEXT: renamable $r0 = tLDRspi %stack.1, 0, 14, $noreg :: (load 4 from %stack.1) +# CHECK-NEXT: $r8 = COPY killed renamable $r0 +# CHECK-NEXT: INLINEASM &"@ $0", 1, 589833, killed renamable $r8, 12, implicit-def early-clobber $r12 +# CHECK-NEXT: tBX_RET 14, $noreg
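Note (illustrative, not part of the patch): the core idea behind the getRegClassForStackSaveRestore() hook is that a spiller first asks the target for a register class it can actually store/reload directly and, if that class differs from the register's own class, routes the value through an intermediate COPY. A minimal sketch of such a spill helper is shown below; the helper name spillViaIntermediate and its parameter list are made up for illustration, while the MachineRegisterInfo, TargetInstrInfo and BuildMI calls are the same ones used by InlineSpiller::insertSpill in this patch.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"

using namespace llvm;

// Illustrative sketch: spill VReg to stack slot Slot, going through an
// intermediate register of a class the target can store directly (for
// example a Thumb1 low register when VReg is in the hGPR class).
static void spillViaIntermediate(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 unsigned VReg, int Slot,
                                 MachineRegisterInfo &MRI,
                                 const TargetInstrInfo &TII,
                                 const TargetRegisterInfo &TRI) {
  const TargetRegisterClass &RC = *MRI.getRegClass(VReg);
  const TargetRegisterClass &StoreRC = *TII.getRegClassForStackSaveRestore(&RC);

  unsigned StoreReg = VReg;
  if (&RC != &StoreRC) {
    // The target cannot store RC directly: copy the value into a register of
    // the save/restore class first.
    StoreReg = MRI.createVirtualRegister(&StoreRC);
    BuildMI(MBB, InsertPt, DebugLoc(), TII.get(TargetOpcode::COPY), StoreReg)
        .addReg(VReg, RegState::Kill);
  }

  // Store the (possibly copied) value using the class the target can handle.
  TII.storeRegToStackSlot(MBB, InsertPt, StoreReg, /*isKill=*/true, Slot,
                          &StoreRC, &TRI);
}

The reload direction mirrors this sequence: load into a register of the save/restore class and COPY the result back into the original class, which is exactly the shape checked by the CHECK-REWRITE lines in hgpr-spill-basic.mir above.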