Index: include/llvm/CodeGen/LiveRangeEdit.h
===================================================================
--- include/llvm/CodeGen/LiveRangeEdit.h
+++ include/llvm/CodeGen/LiveRangeEdit.h
@@ -60,6 +60,7 @@
 private:
   LiveInterval *Parent;
   SmallVectorImpl<unsigned> &NewRegs;
+  SmallPtrSet<MachineInstr *, 32> *DeadRemats;
   MachineRegisterInfo &MRI;
   LiveIntervals &LIS;
   VirtRegMap *VRM;
@@ -111,18 +112,21 @@
   /// @param parent The register being spilled or split.
   /// @param newRegs List to receive any new registers created. This needn't be
   ///                empty initially, any existing registers are ignored.
+  /// @param deadRemats The collection of all instructions that define an
+  ///                   original reg and are dead after remat.
   /// @param MF The MachineFunction the live range edit is taking place in.
   /// @param lis The collection of all live intervals in this function.
   /// @param vrm Map of virtual registers to physical registers for this
   ///            function. If NULL, no virtual register map updates will
   ///            be done. This could be the case if called before Regalloc.
   LiveRangeEdit(LiveInterval *parent, SmallVectorImpl<unsigned> &newRegs,
+                SmallPtrSet<MachineInstr *, 32> *deadRemats,
                 MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm,
                 Delegate *delegate = nullptr)
-      : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis),
-        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()),
-        TheDelegate(delegate), FirstNew(newRegs.size()),
-        ScannedRemattable(false) {
+      : Parent(parent), NewRegs(newRegs), DeadRemats(deadRemats),
+        MRI(MF.getRegInfo()), LIS(lis), VRM(vrm),
+        TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate),
+        FirstNew(newRegs.size()), ScannedRemattable(false) {
     MRI.setDelegate(this);
   }
@@ -141,6 +145,7 @@
   unsigned size() const { return NewRegs.size()-FirstNew; }
   bool empty() const { return size() == 0; }
   unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; }
+  void pop_back() { NewRegs.pop_back(); }
   ArrayRef<unsigned> regs() const {
     return makeArrayRef(NewRegs).slice(FirstNew);
@@ -175,8 +180,11 @@
   /// Remat - Information needed to rematerialize at a specific location.
   struct Remat {
     VNInfo *ParentVNI;    // parent_'s value at the remat location.
-    MachineInstr *OrigMI; // Instruction defining ParentVNI.
-    explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {}
+    VNInfo *OrigVNI;      // ParentVNI.def may be a copy only. OrigVNI.def
+                          // contains the real expr for remat.
+    MachineInstr *OrigMI; // Instruction defining OrigVNI.
+    explicit Remat(VNInfo *ParentVNI, VNInfo *OrigVNI)
+        : ParentVNI(ParentVNI), OrigVNI(OrigVNI), OrigMI(nullptr) {}
   };

   /// canRematerializeAt - Determine if ParentVNI can be rematerialized at
@@ -208,6 +216,12 @@
     return Rematted.count(ParentVNI);
   }

+  void markDeadRemat(MachineInstr *inst) {
+    // For regallocs other than Greedy, DeadRemats is nullptr for now.
+    if (DeadRemats)
+      DeadRemats->insert(inst);
+  }
+
   /// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try
   /// to erase it from LIS.
   void eraseVirtReg(unsigned Reg);
@@ -218,8 +232,11 @@
   /// RegsBeingSpilled lists registers currently being spilled by the register
   /// allocator. These registers should not be split into new intervals
   /// as currently those new intervals are not guaranteed to spill.
-  void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
-                         ArrayRef<unsigned> RegsBeingSpilled = None);
+  /// NoSplit indicates that the call happens after the iterations of
+  /// selectOrSplit, so registers should not be split into new intervals.
+  void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                         ArrayRef<unsigned> RegsBeingSpilled = None,
+                         bool NoSplit = false);

   /// calculateRegClassAndHint - Recompute register class and hint for each new
   /// register.
Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -48,13 +48,70 @@
 STATISTIC(NumFolded,          "Number of folded stack accesses");
 STATISTIC(NumFoldedLoads,     "Number of folded loads");
 STATISTIC(NumRemats,          "Number of rematerialized defs for spilling");
-STATISTIC(NumOmitReloadSpill, "Number of omitted spills of reloads");
-STATISTIC(NumHoists,          "Number of hoisted spills");

 static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
                                      cl::desc("Disable inline spill hoisting"));

 namespace {
+class HoistSpiller {
+  MachineFunction &MF;
+  LiveIntervals &LIS;
+  LiveStacks &LSS;
+  AliasAnalysis *AA;
+  MachineDominatorTree &MDT;
+  MachineLoopInfo &Loops;
+  VirtRegMap &VRM;
+  MachineFrameInfo &MFI;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  const MachineBlockFrequencyInfo &MBFI;
+
+  // Map from StackSlot to its original register.
+  DenseMap<int, unsigned> StackSlotToReg;
+  // Map from a pair of (StackSlot, Original VNI) to a set of spills which
+  // have the same stackslot and have equal values defined by Original VNI.
+  // These spills are mergeable and are hoist candidates.
+  typedef DenseMap<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>
+      MergableSpillsMap;
+  MergableSpillsMap MergableSpills;
+
+  /// Virt2SiblingsMap - This is the map from original register to a set
+  /// containing all its siblings. To hoist a spill to another BB, we need
+  /// to find a live sibling there and use it as the RHS of the new spill.
+  DenseMap<unsigned, DenseSet<unsigned>> Virt2SiblingsMap;
+
+  bool isSpillCandBB(unsigned OrigReg, VNInfo *OrigVNI, MachineBasicBlock *BB,
+                     unsigned &LiveReg);
+  void getVisitOrders(
+      MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+      SmallVectorImpl<MachineDomTreeNode *> &Orders,
+      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+      DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKept,
+      DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+  void runHoistSpills(unsigned OrigReg, VNInfo *OrigVNI,
+                      SmallPtrSet<MachineInstr *, 16> &Spills,
+                      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+                      DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+
+public:
+  HoistSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+      : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+        LSS(pass.getAnalysis<LiveStacks>()),
+        AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+        MDT(pass.getAnalysis<MachineDominatorTree>()),
+        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+        MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+        TII(*mf.getSubtarget().getInstrInfo()),
+        TRI(*mf.getSubtarget().getRegisterInfo()),
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+
+  void addToMergableSpills(MachineInstr *Spill, int StackSlot,
+                           unsigned Original);
+  bool rmFromMergableSpills(MachineInstr *Spill, int StackSlot);
+  void hoistAllSpills(LiveRangeEdit &Edit);
+};
+
 class InlineSpiller : public Spiller {
   MachineFunction &MF;
   LiveIntervals &LIS;
@@ -85,56 +142,12 @@
   // Values that failed to remat at some point.
   SmallPtrSet<VNInfo*, 8> UsedValues;

-public:
-  // Information about a value that was defined by a copy from a sibling
-  // register.
-  struct SibValueInfo {
-    // True when all reaching defs were reloads: No spill is necessary.
-    bool AllDefsAreReloads;
-
-    // True when value is defined by an original PHI not from splitting.
-    bool DefByOrigPHI;
-
-    // True when the COPY defining this value killed its source.
-    bool KillsSource;
-
-    // The preferred register to spill.
-    unsigned SpillReg;
-
-    // The value of SpillReg that should be spilled.
-    VNInfo *SpillVNI;
-
-    // The block where SpillVNI should be spilled. Currently, this must be the
-    // block containing SpillVNI->def.
-    MachineBasicBlock *SpillMBB;
-
-    // A defining instruction that is not a sibling copy or a reload, or NULL.
-    // This can be used as a template for rematerialization.
-    MachineInstr *DefMI;
-
-    // List of values that depend on this one. These values are actually the
-    // same, but live range splitting has placed them in different registers,
-    // or SSA update needed to insert PHI-defs to preserve SSA form. This is
-    // copies of the current value and phi-kills. Usually only phi-kills cause
-    // more than one dependent value.
-    TinyPtrVector<VNInfo*> Deps;
-
-    SibValueInfo(unsigned Reg, VNInfo *VNI)
-        : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false),
-          SpillReg(Reg), SpillVNI(VNI), SpillMBB(nullptr), DefMI(nullptr) {}
-
-    // Returns true when a def has been found.
-    bool hasDef() const { return DefByOrigPHI || DefMI; }
-  };
-
-private:
-  // Values in RegsToSpill defined by sibling copies.
-  typedef DenseMap<VNInfo*, SibValueInfo> SibValueMap;
-  SibValueMap SibValues;
-
   // Dead defs generated during spilling.
   SmallVector<MachineInstr*, 8> DeadDefs;

+  // Object that records spill information and performs the hoisting.
+  HoistSpiller *HSpiller;
+
   ~InlineSpiller() override {}

 public:
@@ -147,9 +160,14 @@
         MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
         TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
-        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()), HSpiller(nullptr) {
+  }

   void spill(LiveRangeEdit &) override;
+  void setHSpiller(HoistSpiller *HS) { HSpiller = HS; }
+  HoistSpiller *getHSpiller() { return HSpiller; }
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  static inline bool classof(const Spiller *V) { return true; }

 private:
   bool isSnippet(const LiveInterval &SnipLI);
@@ -161,11 +179,6 @@
   }

   bool isSibling(unsigned Reg);
-  MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*);
-  void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = nullptr);
-  void analyzeSiblingValues();
-
-  bool hoistSpill(LiveInterval &SpillLI, MachineInstr &CopyMI);
   void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
   void markValueUsed(LiveInterval*, VNInfo*);
@@ -194,6 +207,21 @@
   return new InlineSpiller(pass, mf, vrm);
 }

+void createHoistSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                        VirtRegMap &vrm, Spiller *spiller) {
+  HoistSpiller *HSpiller = new HoistSpiller(pass, mf, vrm);
+  (dyn_cast<InlineSpiller>(spiller))->setHSpiller(HSpiller);
+}
+
+void startHoistSpiller(MachineFunction &mf, VirtRegMap &vrm, LiveIntervals &lis,
+                       Spiller *spiller) {
+  SmallVector<unsigned, 4> NewVRegs;
+  LiveRangeEdit LRE(nullptr, NewVRegs, nullptr, mf, lis, &vrm, nullptr);
+  HoistSpiller *HSpiller = (dyn_cast<InlineSpiller>(spiller))->getHSpiller();
+  HSpiller->hoistAllSpills(LRE);
+  assert(NewVRegs.size() == 0 &&
+         "No new vregs should be generated in hoistAllSpills");
+}
 }

//===----------------------------------------------------------------------===//
@@ -297,460 +325,11 @@
   }
 }

-
-//===----------------------------------------------------------------------===//
-//                            Sibling Values
-//===----------------------------------------------------------------------===//
-
-// After live range splitting, some values to be spilled may be defined by
-// copies from sibling registers. We trace the sibling copies back to the
-// original value if it still exists. We need it for rematerialization.
-// -// Even when the value can't be rematerialized, we still want to determine if -// the value has already been spilled, or we may want to hoist the spill from a -// loop. - bool InlineSpiller::isSibling(unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) && VRM.getOriginal(Reg) == Original; } -#ifndef NDEBUG -static raw_ostream &operator<<(raw_ostream &OS, - const InlineSpiller::SibValueInfo &SVI) { - OS << "spill " << PrintReg(SVI.SpillReg) << ':' - << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def; - if (SVI.SpillMBB) - OS << " in BB#" << SVI.SpillMBB->getNumber(); - if (SVI.AllDefsAreReloads) - OS << " all-reloads"; - if (SVI.DefByOrigPHI) - OS << " orig-phi"; - if (SVI.KillsSource) - OS << " kill"; - OS << " deps["; - for (VNInfo *Dep : SVI.Deps) - OS << ' ' << Dep->id << '@' << Dep->def; - OS << " ]"; - if (SVI.DefMI) - OS << " def: " << *SVI.DefMI; - else - OS << '\n'; - return OS; -} -#endif - -/// propagateSiblingValue - Propagate the value in SVI to dependents if it is -/// known. Otherwise remember the dependency for later. -/// -/// @param SVIIter SibValues entry to propagate. -/// @param VNI Dependent value, or NULL to propagate to all saved dependents. -void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter, - VNInfo *VNI) { - SibValueMap::value_type *SVI = &*SVIIter; - - // When VNI is non-NULL, add it to SVI's deps, and only propagate to that. - TinyPtrVector FirstDeps; - if (VNI) { - FirstDeps.push_back(VNI); - SVI->second.Deps.push_back(VNI); - } - - // Has the value been completely determined yet? If not, defer propagation. - if (!SVI->second.hasDef()) - return; - - // Work list of values to propagate. - SmallSetVector WorkList; - WorkList.insert(SVI); - - do { - SVI = WorkList.pop_back_val(); - TinyPtrVector *Deps = VNI ? &FirstDeps : &SVI->second.Deps; - VNI = nullptr; - - SibValueInfo &SV = SVI->second; - if (!SV.SpillMBB) - SV.SpillMBB = LIS.getMBBFromIndex(SV.SpillVNI->def); - - DEBUG(dbgs() << " prop to " << Deps->size() << ": " - << SVI->first->id << '@' << SVI->first->def << ":\t" << SV); - - assert(SV.hasDef() && "Propagating undefined value"); - - // Should this value be propagated as a preferred spill candidate? We don't - // propagate values of registers that are about to spill. - bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg); - unsigned SpillDepth = ~0u; - - for (VNInfo *Dep : *Deps) { - SibValueMap::iterator DepSVI = SibValues.find(Dep); - assert(DepSVI != SibValues.end() && "Dependent value not in SibValues"); - SibValueInfo &DepSV = DepSVI->second; - if (!DepSV.SpillMBB) - DepSV.SpillMBB = LIS.getMBBFromIndex(DepSV.SpillVNI->def); - - bool Changed = false; - - // Propagate defining instruction. - if (!DepSV.hasDef()) { - Changed = true; - DepSV.DefMI = SV.DefMI; - DepSV.DefByOrigPHI = SV.DefByOrigPHI; - } - - // Propagate AllDefsAreReloads. For PHI values, this computes an AND of - // all predecessors. - if (!SV.AllDefsAreReloads && DepSV.AllDefsAreReloads) { - Changed = true; - DepSV.AllDefsAreReloads = false; - } - - // Propagate best spill value. - if (PropSpill && SV.SpillVNI != DepSV.SpillVNI) { - if (SV.SpillMBB == DepSV.SpillMBB) { - // DepSV is in the same block. Hoist when dominated. - if (DepSV.KillsSource && SV.SpillVNI->def < DepSV.SpillVNI->def) { - // This is an alternative def earlier in the same MBB. - // Hoist the spill as far as possible in SpillMBB. 
This can ease - // register pressure: - // - // x = def - // y = use x - // s = copy x - // - // Hoisting the spill of s to immediately after the def removes the - // interference between x and y: - // - // x = def - // spill x - // y = use x - // - // This hoist only helps when the DepSV copy kills its source. - Changed = true; - DepSV.SpillReg = SV.SpillReg; - DepSV.SpillVNI = SV.SpillVNI; - DepSV.SpillMBB = SV.SpillMBB; - } - } else { - // DepSV is in a different block. - if (SpillDepth == ~0u) - SpillDepth = Loops.getLoopDepth(SV.SpillMBB); - - // Also hoist spills to blocks with smaller loop depth, but make sure - // that the new value dominates. Non-phi dependents are always - // dominated, phis need checking. - - const BranchProbability MarginProb(4, 5); // 80% - // Hoist a spill to outer loop if there are multiple dependents (it - // can be beneficial if more than one dependents are hoisted) or - // if DepSV (the hoisting source) is hotter than SV (the hoisting - // destination) (we add a 80% margin to bias a little towards - // loop depth). - bool HoistCondition = - (MBFI.getBlockFreq(DepSV.SpillMBB) >= - (MBFI.getBlockFreq(SV.SpillMBB) * MarginProb)) || - Deps->size() > 1; - - if ((Loops.getLoopDepth(DepSV.SpillMBB) > SpillDepth) && - HoistCondition && - (!DepSVI->first->isPHIDef() || - MDT.dominates(SV.SpillMBB, DepSV.SpillMBB))) { - Changed = true; - DepSV.SpillReg = SV.SpillReg; - DepSV.SpillVNI = SV.SpillVNI; - DepSV.SpillMBB = SV.SpillMBB; - } - } - } - - if (!Changed) - continue; - - // Something changed in DepSVI. Propagate to dependents. - WorkList.insert(&*DepSVI); - - DEBUG(dbgs() << " update " << DepSVI->first->id << '@' - << DepSVI->first->def << " to:\t" << DepSV); - } - } while (!WorkList.empty()); -} - -/// traceSiblingValue - Trace a value that is about to be spilled back to the -/// real defining instructions by looking through sibling copies. Always stay -/// within the range of OrigVNI so the registers are known to carry the same -/// value. -/// -/// Determine if the value is defined by all reloads, so spilling isn't -/// necessary - the value is already in the stack slot. -/// -/// Return a defining instruction that may be a candidate for rematerialization. -/// -MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, - VNInfo *OrigVNI) { - // Check if a cached value already exists. - SibValueMap::iterator SVI; - bool Inserted; - std::tie(SVI, Inserted) = - SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI))); - if (!Inserted) { - DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':' - << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second); - return SVI->second.DefMI; - } - - DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':' - << UseVNI->id << '@' << UseVNI->def << '\n'); - - // List of (Reg, VNI) that have been inserted into SibValues, but need to be - // processed. - SmallVector, 8> WorkList; - WorkList.push_back(std::make_pair(UseReg, UseVNI)); - - LiveInterval &OrigLI = LIS.getInterval(Original); - do { - unsigned Reg; - VNInfo *VNI; - std::tie(Reg, VNI) = WorkList.pop_back_val(); - DEBUG(dbgs() << " " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def - << ":\t"); - - // First check if this value has already been computed. - SVI = SibValues.find(VNI); - assert(SVI != SibValues.end() && "Missing SibValues entry"); - - // Trace through PHI-defs created by live range splitting. - if (VNI->isPHIDef()) { - // Stop at original PHIs. We don't know the value at the - // predecessors. 
Look up the VNInfo for the current definition - // in OrigLI, to properly determine whether or not this phi was - // added by splitting. - if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) { - DEBUG(dbgs() << "orig phi value\n"); - SVI->second.DefByOrigPHI = true; - SVI->second.AllDefsAreReloads = false; - propagateSiblingValue(SVI); - continue; - } - - // This is a PHI inserted by live range splitting. We could trace the - // live-out value from predecessor blocks, but that search can be very - // expensive if there are many predecessors and many more PHIs as - // generated by tail-dup when it sees an indirectbr. Instead, look at - // all the non-PHI defs that have the same value as OrigVNI. They must - // jointly dominate VNI->def. This is not optimal since VNI may actually - // be jointly dominated by a smaller subset of defs, so there is a change - // we will miss a AllDefsAreReloads optimization. - - // Separate all values dominated by OrigVNI into PHIs and non-PHIs. - SmallVector PHIs, NonPHIs; - LiveInterval &LI = LIS.getInterval(Reg); - - for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end(); - VI != VE; ++VI) { - VNInfo *VNI2 = *VI; - if (VNI2->isUnused()) - continue; - if (!OrigLI.containsOneValue() && - OrigLI.getVNInfoAt(VNI2->def) != OrigVNI) - continue; - if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def) - PHIs.push_back(VNI2); - else - NonPHIs.push_back(VNI2); - } - DEBUG(dbgs() << "split phi value, checking " << PHIs.size() - << " phi-defs, and " << NonPHIs.size() - << " non-phi/orig defs\n"); - - // Create entries for all the PHIs. Don't add them to the worklist, we - // are processing all of them in one go here. - for (VNInfo *PHI : PHIs) - SibValues.insert(std::make_pair(PHI, SibValueInfo(Reg, PHI))); - - // Add every PHI as a dependent of all the non-PHIs. - for (VNInfo *NonPHI : NonPHIs) { - // Known value? Try an insertion. - std::tie(SVI, Inserted) = - SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI))); - // Add all the PHIs as dependents of NonPHI. - SVI->second.Deps.insert(SVI->second.Deps.end(), PHIs.begin(), - PHIs.end()); - // This is the first time we see NonPHI, add it to the worklist. - if (Inserted) - WorkList.push_back(std::make_pair(Reg, NonPHI)); - else - // Propagate to all inserted PHIs, not just VNI. - propagateSiblingValue(SVI); - } - - // Next work list item. - continue; - } - - MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); - assert(MI && "Missing def"); - - // Trace through sibling copies. - if (unsigned SrcReg = isFullCopyOf(MI, Reg)) { - if (isSibling(SrcReg)) { - LiveInterval &SrcLI = LIS.getInterval(SrcReg); - LiveQueryResult SrcQ = SrcLI.Query(VNI->def); - assert(SrcQ.valueIn() && "Copy from non-existing value"); - // Check if this COPY kills its source. - SVI->second.KillsSource = SrcQ.isKill(); - VNInfo *SrcVNI = SrcQ.valueIn(); - DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':' - << SrcVNI->id << '@' << SrcVNI->def - << " kill=" << unsigned(SVI->second.KillsSource) << '\n'); - // Known sibling source value? Try an insertion. - std::tie(SVI, Inserted) = SibValues.insert( - std::make_pair(SrcVNI, SibValueInfo(SrcReg, SrcVNI))); - // This is the first time we see Src, add it to the worklist. - if (Inserted) - WorkList.push_back(std::make_pair(SrcReg, SrcVNI)); - propagateSiblingValue(SVI, VNI); - // Next work list item. - continue; - } - } - - // Track reachable reloads. 
- SVI->second.DefMI = MI; - SVI->second.SpillMBB = MI->getParent(); - int FI; - if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) { - DEBUG(dbgs() << "reload\n"); - propagateSiblingValue(SVI); - // Next work list item. - continue; - } - - // Potential remat candidate. - DEBUG(dbgs() << "def " << *MI); - SVI->second.AllDefsAreReloads = false; - propagateSiblingValue(SVI); - } while (!WorkList.empty()); - - // Look up the value we were looking for. We already did this lookup at the - // top of the function, but SibValues may have been invalidated. - SVI = SibValues.find(UseVNI); - assert(SVI != SibValues.end() && "Didn't compute requested info"); - DEBUG(dbgs() << " traced to:\t" << SVI->second); - return SVI->second.DefMI; -} - -/// analyzeSiblingValues - Trace values defined by sibling copies back to -/// something that isn't a sibling copy. -/// -/// Keep track of values that may be rematerializable. -void InlineSpiller::analyzeSiblingValues() { - SibValues.clear(); - - // No siblings at all? - if (Edit->getReg() == Original) - return; - - LiveInterval &OrigLI = LIS.getInterval(Original); - for (unsigned Reg : RegsToSpill) { - LiveInterval &LI = LIS.getInterval(Reg); - for (LiveInterval::const_vni_iterator VI = LI.vni_begin(), - VE = LI.vni_end(); VI != VE; ++VI) { - VNInfo *VNI = *VI; - if (VNI->isUnused()) - continue; - MachineInstr *DefMI = nullptr; - if (!VNI->isPHIDef()) { - DefMI = LIS.getInstructionFromIndex(VNI->def); - assert(DefMI && "No defining instruction"); - } - // Check possible sibling copies. - if (VNI->isPHIDef() || DefMI->isCopy()) { - VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def); - assert(OrigVNI && "Def outside original live range"); - if (OrigVNI->def != VNI->def) - DefMI = traceSiblingValue(Reg, VNI, OrigVNI); - } - if (DefMI && Edit->checkRematerializable(VNI, DefMI, AA)) { - DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@' - << VNI->def << " may remat from " << *DefMI); - } - } - } -} - -/// hoistSpill - Given a sibling copy that defines a value to be spilled, insert -/// a spill at a better location. -bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr &CopyMI) { - SlotIndex Idx = LIS.getInstructionIndex(CopyMI); - VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot()); - assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy"); - SibValueMap::iterator I = SibValues.find(VNI); - if (I == SibValues.end()) - return false; - - const SibValueInfo &SVI = I->second; - - // Let the normal folding code deal with the boring case. - if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI) - return false; - - // SpillReg may have been deleted by remat and DCE. - if (!LIS.hasInterval(SVI.SpillReg)) { - DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n'); - SibValues.erase(I); - return false; - } - - LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg); - if (!SibLI.containsValue(SVI.SpillVNI)) { - DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n'); - SibValues.erase(I); - return false; - } - - // Conservatively extend the stack slot range to the range of the original - // value. We may be able to do better with stack slot coloring by being more - // careful here. 
- assert(StackInt && "No stack slot assigned yet."); - LiveInterval &OrigLI = LIS.getInterval(Original); - VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx); - StackInt->MergeValueInAsValue(OrigLI, OrigVNI, StackInt->getValNumInfo(0)); - DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": " - << *StackInt << '\n'); - - // Already spilled everywhere. - if (SVI.AllDefsAreReloads) { - DEBUG(dbgs() << "\tno spill needed: " << SVI); - ++NumOmitReloadSpill; - return true; - } - // We are going to spill SVI.SpillVNI immediately after its def, so clear out - // any later spills of the same value. - eliminateRedundantSpills(SibLI, SVI.SpillVNI); - - MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def); - MachineBasicBlock::iterator MII; - if (SVI.SpillVNI->isPHIDef()) - MII = MBB->SkipPHIsAndLabels(MBB->begin()); - else { - MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def); - assert(DefMI && "Defining instruction disappeared"); - MII = DefMI; - ++MII; - } - // Insert spill without kill flag immediately after def. - TII.storeRegToStackSlot(*MBB, MII, SVI.SpillReg, false, StackSlot, - MRI.getRegClass(SVI.SpillReg), &TRI); - --MII; // Point to store instruction. - LIS.InsertMachineInstrInMaps(*MII); - DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII); - - ++NumSpills; - ++NumHoists; - return true; -} - /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any /// redundant spills of this value in SLI.reg and sibling copies. void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) { @@ -805,7 +384,8 @@ MI->setDesc(TII.get(TargetOpcode::KILL)); DeadDefs.push_back(MI); ++NumSpillsRemoved; - --NumSpills; + if (HSpiller && HSpiller->rmFromMergableSpills(MI, StackSlot)) + --NumSpills; } } } while (!WorkList.empty()); @@ -876,11 +456,11 @@ if (SnippetCopies.count(&MI)) return false; - // Use an OrigVNI from traceSiblingValue when ParentVNI is a sibling copy. - LiveRangeEdit::Remat RM(ParentVNI); - SibValueMap::const_iterator SibI = SibValues.find(ParentVNI); - if (SibI != SibValues.end()) - RM.OrigMI = SibI->second.DefMI; + LiveInterval &OrigLI = LIS.getInterval(Original); + VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); + LiveRangeEdit::Remat RM(ParentVNI, OrigVNI); + RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def); + if (!Edit->canRematerializeAt(RM, UseIdx, false)) { markValueUsed(&VirtReg, ParentVNI); DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI); @@ -931,7 +511,6 @@ /// reMaterializeAll - Try to rematerialize as many uses as possible, /// and trim the live ranges after. void InlineSpiller::reMaterializeAll() { - // analyzeSiblingValues has already tested all relevant defining instructions. 
   if (!Edit->anyRematerializable(AA))
     return;
@@ -1017,6 +596,9 @@
   if (InstrReg != Reg || FI != StackSlot)
     return false;

+  if (!IsLoad && HSpiller)
+    HSpiller->rmFromMergableSpills(MI, StackSlot);
+
   DEBUG(dbgs() << "Coalescing stack access: " << *MI);
   LIS.RemoveMachineInstrFromMaps(*MI);
   MI->eraseFromParent();
@@ -1141,6 +723,10 @@
     LIS.removePhysRegDefAt(Reg, Idx);
   }

+  int FI;
+  if (TII.isStoreToStackSlot(MI, FI) && HSpiller &&
+      HSpiller->rmFromMergableSpills(MI, FI))
+    --NumSpills;
   LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
   MI->eraseFromParent();
@@ -1166,9 +752,11 @@
   if (!WasCopy)
     ++NumFolded;
-  else if (Ops.front().second == 0)
+  else if (Ops.front().second == 0) {
     ++NumSpills;
-  else
+    if (HSpiller)
+      HSpiller->addToMergableSpills(FoldMI, StackSlot, Original);
+  } else
     ++NumReloads;
   return true;
 }
@@ -1203,6 +791,8 @@
   DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
                                            "spill"));
   ++NumSpills;
+  if (HSpiller)
+    HSpiller->addToMergableSpills(std::next(MI), StackSlot, Original);
 }

 /// spillAroundUses - insert spill code around each use of Reg.
@@ -1265,15 +855,7 @@
         SnippetCopies.insert(MI);
         continue;
       }
-      if (RI.Writes) {
-        // Hoist the spill of a sib-reg copy.
-        if (hoistSpill(OldLI, *MI)) {
-          // This COPY is now dead, the value is already in the stack slot.
-          MI->getOperand(0).setIsDead();
-          DeadDefs.push_back(MI);
-          continue;
-        }
-      } else {
+      if (!RI.Writes) {
         // This is a reload for a sib-reg copy. Drop spills downstream.
         LiveInterval &SibLI = LIS.getInterval(SibReg);
         eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx));
@@ -1380,7 +962,6 @@
   assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");

   collectRegsToSpill();
-  analyzeSiblingValues();
   reMaterializeAll();

   // Remat may handle everything.
@@ -1389,3 +970,332 @@

   Edit->calculateRegClassAndHint(MF, Loops, MBFI);
 }
+
+// When a spill is inserted, add the spill to the MergableSpills map.
+void HoistSpiller::addToMergableSpills(MachineInstr *Spill, int StackSlot,
+                                       unsigned Original) {
+  StackSlotToReg[StackSlot] = Original;
+  SlotIndex Idx = LIS.getInstructionIndex(*Spill);
+  VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+  std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+  MergableSpills[MIdx].insert(Spill);
+}
+
+// When a spill is removed, remove the spill from the MergableSpills map.
+// Return true if the spill was removed successfully.
+bool HoistSpiller::rmFromMergableSpills(MachineInstr *Spill, int StackSlot) {
+  int Original = StackSlotToReg[StackSlot];
+  if (!Original)
+    return false;
+  SlotIndex Idx = LIS.getInstructionIndex(*Spill);
+  VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+  std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+  return MergableSpills[MIdx].erase(Spill);
+}
+
+// Check BB to see if it is a possible target BB to place a hoisted spill,
+// i.e., there should be a live sibling of OrigReg at the insert point.
+bool HoistSpiller::isSpillCandBB(unsigned OrigReg, VNInfo *OrigVNI,
+                                 MachineBasicBlock *BB, unsigned &LiveReg) {
+  SlotIndex Idx;
+  MachineBasicBlock::iterator MI = BB->getFirstTerminator();
+  if (MI != BB->end())
+    Idx = LIS.getInstructionIndex(*MI);
+  else
+    Idx = LIS.getMBBEndIdx(BB).getPrevSlot();
+  DenseSet<unsigned> &Siblings = Virt2SiblingsMap[OrigReg];
+  assert((LIS.getInterval(OrigReg)).getVNInfoAt(Idx) == OrigVNI &&
+         "Unexpected VNI");
+
+  for (auto const ent : Siblings) {
+    LiveInterval &LI = LIS.getInterval(ent);
+    VNInfo *VNI = LI.getVNInfoAt(Idx);
+    if (VNI) {
+      LiveReg = ent;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Get the top-down order in which to visit the BB nodes containing spills.
+/// Redundant spills are found and put into SpillsToRm at the same time.
+void HoistSpiller::getVisitOrders(
+    MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineDomTreeNode *> &Orders,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKept,
+    DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+  // For each spill, check the BB the spill is located in and set
+  // SpillBBToSpill[]. If a BB contains more than one spill, only keep
+  // the spill with the smaller SlotIndex.
+  for (const auto CurrentSpill : Spills) {
+    MachineBasicBlock *Block = CurrentSpill->getParent();
+    MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+    MachineInstr *PrevSpill = SpillBBToSpill[Node];
+    if (PrevSpill) {
+      SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
+      SlotIndex CIdx = LIS.getInstructionIndex(*CurrentSpill);
+      MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+      MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+      SpillsToRm.push_back(SpillToRm);
+      SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
+    } else {
+      SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
+    }
+  }
+  for (const auto SpillToRm : SpillsToRm)
+    Spills.erase(SpillToRm);
+
+  SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+  SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+  MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+  // For every node in the dominator tree with a spill, walk up the
+  // dominator tree until reaching the Root node. If another node with a
+  // spill is found on the path, the original node is redundant and will
+  // be removed. All the nodes on the path from a node with a non-redundant
+  // spill to the Root node are added to the WorkSet, which is the set we
+  // look at when hoisting spills in the next step.
+  for (const auto Spill : Spills) {
+    MachineBasicBlock *Block = Spill->getParent();
+    MachineDomTreeNode *Node = MDT[Block];
+    MachineInstr *SpillToRm = nullptr;
+    while (Node != RootIDomNode) {
+      if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+        SpillToRm = SpillBBToSpill[MDT[Block]];
+        break;
+      } else if (WorkSet.count(Node)) {
+        break;
+      } else {
+        NodesOnPath.insert(Node);
+      }
+      Node = Node->getIDom();
+    }
+    if (SpillToRm) {
+      SpillsToRm.push_back(SpillToRm);
+    } else {
+      SpillsToKept[MDT[Block]] = 0;
+      WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+    }
+    NodesOnPath.clear();
+  }
+
+  // Sort the nodes in WorkSet in top-down order and save the nodes
+  // in Orders.
+  unsigned idx = 0;
+  Orders.push_back(MDT.DT->getNode(Root));
+  do {
+    MachineDomTreeNode *Node = Orders[idx++];
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      if (WorkSet.count(Child))
+        Orders.push_back(Child);
+    }
+  } while (idx != Orders.size());
+
+  DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+  {
+    SmallVectorImpl<MachineDomTreeNode *>::reverse_iterator RIt =
+        Orders.rbegin();
+    for (; RIt != Orders.rend(); RIt++)
+      DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+  }
+  DEBUG(dbgs() << "\n");
+}
+
+/// Try to hoist spills according to BB hotness. The spills to be removed
+/// are saved in SpillsToRm; the spills to be inserted are saved in
+/// SpillsToIns.
+void HoistSpiller::runHoistSpills(
+    unsigned OrigReg, VNInfo *OrigVNI, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+  // Visit order of dominator tree nodes.
+  SmallVector<MachineDomTreeNode *, 32> Orders;
+  // SpillsToKept contains all the nodes where spills are to be inserted
+  // during hoisting. If the spill to be inserted is an original spill
+  // (not a hoisted one), the value of the map entry is 0. If the spill
+  // is a hoisted spill, the value of the map entry is the VReg to be used
+  // as the RHS of the spill.
+  DenseMap<MachineDomTreeNode *, unsigned> SpillsToKept;
+  // Map from BB to the spill inside of it.
+  DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+  MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI->def);
+  getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKept,
+                 SpillBBToSpill);
+
+  // SpillsInSubTree keeps the map from a dom tree node to a set of nodes:
+  // it saves the locations where spills are to be inserted in the subtree
+  // of the node.
+  DenseMap<MachineDomTreeNode *, SmallPtrSet<MachineDomTreeNode *, 16>>
+      SpillsInSubTree;
+  // Iterate over Orders in reverse order, which is a bottom-up order in
+  // the dominator tree. Once we visit a dom tree node, we know that its
+  // children have already been visited and the spill locations in the
+  // subtrees of all the children have been determined.
+  SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+  for (; RIt != Orders.rend(); RIt++) {
+    MachineBasicBlock *Block = (*RIt)->getBlock();
+
+    // If Block contains an original spill, simply continue.
+    if (SpillsToKept.find(*RIt) != SpillsToKept.end() && !SpillsToKept[*RIt]) {
+      SpillsInSubTree[*RIt].insert(*RIt);
+      continue;
+    }
+
+    // Collect spills in the subtree of the current node (*RIt) into
+    // SpillsInSubTree[*RIt].
+    const std::vector<MachineDomTreeNode *> &Children = (*RIt)->getChildren();
+    unsigned NumChildren = Children.size();
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      SpillsInSubTree[*RIt].insert(SpillsInSubTree[Child].begin(),
+                                   SpillsInSubTree[Child].end());
+      SpillsInSubTree.erase(Child);
+    }
+
+    // No spills in subtree, simply continue.
+    if (SpillsInSubTree[*RIt].empty())
+      continue;
+
+    // Check whether Block is a possible candidate for inserting a spill.
+    unsigned LiveReg = 0;
+    if (!isSpillCandBB(OrigReg, OrigVNI, Block, LiveReg))
+      continue;
+
+    // Now Block is a proper target BB for hoisting spills. Decide whether
+    // to hoist the spills to the current node: get the existing cost of
+    // all the spills in SpillsInSubTree[*RIt].
+    BlockFrequency SpillCost = 0;
+    for (const auto SpillBB : SpillsInSubTree[*RIt])
+      SpillCost += MBFI.getBlockFreq(SpillBB->getBlock());
+
+    // If there are multiple spills that could be merged, bias a little
+    // towards hoisting the spill.
+    BranchProbability MarginProb = (SpillsInSubTree[*RIt].size() > 1)
+                                       ? BranchProbability(9, 10)
+                                       : BranchProbability(1, 1);
+    if (SpillCost > MBFI.getBlockFreq(Block) * MarginProb) {
+      // Hoist: Move spills to the current Block.
+      for (const auto SpillBB : SpillsInSubTree[*RIt]) {
+        // When SpillBB is a BB containing an original spill, insert the
+        // spill into SpillsToRm.
+        if (SpillsToKept.find(SpillBB) != SpillsToKept.end() &&
+            !SpillsToKept[SpillBB]) {
+          MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+          SpillsToRm.push_back(SpillToRm);
+        }
+        // SpillBB will not contain a spill anymore, so remove it from
+        // SpillsToKept.
+        SpillsToKept.erase(SpillBB);
+      }
+      // The current Block is the BB containing the new hoisted spill. Add
+      // it to SpillsToKept. LiveReg is the RHS of the spill.
+      SpillsToKept[*RIt] = LiveReg;
+      DEBUG({
+        dbgs() << "spills in BB: ";
+        for (const auto Rspill : SpillsInSubTree[*RIt])
+          dbgs() << Rspill->getBlock()->getNumber() << " ";
+        dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+               << "\n";
+      });
+      SpillsInSubTree[*RIt].clear();
+      SpillsInSubTree[*RIt].insert(*RIt);
+    }
+  }
+  // For spills in SpillsToKept with LiveReg set (i.e., not an original
+  // spill), save them to SpillsToIns.
+  for (const auto ent : SpillsToKept) {
+    if (ent.second)
+      SpillsToIns[ent.first->getBlock()] = ent.second;
+  }
+}
+
+/// For spills with equal values, remove redundant spills and hoist the
+/// remaining ones to a less hot spot.
+void HoistSpiller::hoistAllSpills(LiveRangeEdit &Edit) {
+  // Save the mapping between stackslot and its original reg.
+  DenseMap<int, unsigned> SlotToOrigReg;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    int Slot = VRM.getStackSlot(Reg);
+    if (Slot != VirtRegMap::NO_STACK_SLOT) {
+      for (const auto &ent : MergableSpills) {
+        if (ent.first.first == Slot &&
+            SlotToOrigReg.find(Slot) == SlotToOrigReg.end())
+          SlotToOrigReg[Slot] = VRM.getOriginal(Reg);
+      }
+    }
+    unsigned Original = VRM.getPreSplitReg(Reg);
+    if (!MRI.def_empty(Reg))
+      Virt2SiblingsMap[Original].insert(Reg);
+  }
+
+  // Each entry in MergableSpills contains a spill set with equal values.
+  for (auto &ent : MergableSpills) {
+    int Slot = ent.first.first;
+    unsigned OrigReg = SlotToOrigReg[Slot];
+    VNInfo *OrigVNI = ent.first.second;
+    SmallPtrSet<MachineInstr *, 16> &EqValSpills = ent.second;
+    if (!ent.second.size())
+      continue;
+
+    DEBUG({
+      dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+             << "Equal spills in BB: ";
+      for (const auto spill : EqValSpills)
+        dbgs() << spill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // SpillsToRm is the spill set to be removed from EqValSpills.
+    SmallVector<MachineInstr *, 16> SpillsToRm;
+    // SpillsToIns is the spill set to be newly inserted after hoisting.
+    DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+    runHoistSpills(OrigReg, OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+    DEBUG({
+      dbgs() << "Finally inserted spills in BB: ";
+      for (const auto Ispill : SpillsToIns)
+        dbgs() << Ispill.first->getNumber() << " ";
+      dbgs() << "\nFinally removed spills in BB: ";
+      for (const auto Rspill : SpillsToRm)
+        dbgs() << Rspill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // Stack live range update.
+    LiveInterval &StackIntvl = LSS.getInterval(Slot);
+    if (!SpillsToIns.empty() || !SpillsToRm.empty()) {
+      LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+      StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+                                     StackIntvl.getValNumInfo(0));
+    }
+
+    // Insert hoisted spills.
+    for (auto const ent : SpillsToIns) {
+      MachineBasicBlock *BB = ent.first;
+      unsigned LiveReg = ent.second;
+      MachineBasicBlock::iterator MI = BB->getFirstTerminator();
+      TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
+                              MRI.getRegClass(LiveReg), &TRI);
+      LIS.InsertMachineInstrRangeInMaps(std::prev(MI), MI);
+      ++NumSpills;
+    }
+
+    // Remove redundant spills or change them to dead instructions.
+    NumSpills -= SpillsToRm.size();
+    for (auto const ent : SpillsToRm) {
+      ent->setDesc(TII.get(TargetOpcode::KILL));
+      for (unsigned i = ent->getNumOperands(); i; --i) {
+        MachineOperand &MO = ent->getOperand(i - 1);
+        if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+          ent->RemoveOperand(i - 1);
+      }
+    }
+    Edit.eliminateDeadDefs(SpillsToRm, None, true);
+  }
+}
Index: lib/CodeGen/LiveRangeEdit.cpp
===================================================================
--- lib/CodeGen/LiveRangeEdit.cpp
+++ lib/CodeGen/LiveRangeEdit.cpp
@@ -63,10 +63,13 @@
   for (VNInfo *VNI : getParent().valnos) {
     if (VNI->isUnused())
       continue;
-    MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
+    unsigned Original = VRM->getOriginal(getReg());
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
     if (!DefMI)
       continue;
-    checkRematerializable(VNI, DefMI, aa);
+    checkRematerializable(OrigVNI, DefMI, aa);
   }
   ScannedRemattable = true;
 }
@@ -119,18 +122,13 @@
   assert(ScannedRemattable && "Call anyRematerializable first");

   // Use scanRemattable info.
-  if (!Remattable.count(RM.ParentVNI))
+  if (!Remattable.count(RM.OrigVNI))
     return false;

   // No defining instruction provided.
   SlotIndex DefIdx;
-  if (RM.OrigMI)
-    DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
-  else {
-    DefIdx = RM.ParentVNI->def;
-    RM.OrigMI = LIS.getInstructionFromIndex(DefIdx);
-    assert(RM.OrigMI && "No defining instruction for remattable value");
-  }
+  assert(RM.OrigMI && "No defining instruction for remattable value");
+  DefIdx = LIS.getInstructionIndex(*(RM.OrigMI));

   // If only cheap remats were requested, bail out early.
   if (cheapAsAMove && !TII.isAsCheapAsAMove(RM.OrigMI))
@@ -261,6 +259,15 @@
   // Collect virtual registers to be erased after MI is gone.
   SmallVector<unsigned, 8> RegsToErase;
   bool ReadsPhysRegs = false;
+  bool isOrigDef = false;
+  unsigned Dest;
+  if (VRM && MI->getOperand(0).isReg()) {
+    Dest = MI->getOperand(0).getReg();
+    unsigned Original = VRM->getOriginal(Dest);
+    LiveInterval &OrigLI = LIS.getInterval(Original);
+    VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+    isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+  }

   // Check for live intervals that may shrink
   for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
@@ -314,11 +321,24 @@
     }
     DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
   } else {
-    if (TheDelegate)
-      TheDelegate->LRE_WillEraseInstruction(MI);
-    LIS.RemoveMachineInstrFromMaps(*MI);
-    MI->eraseFromParent();
-    ++NumDCEDeleted;
+    // If the dest of MI is an original reg, don't delete the inst. Replace
+    // the dest with a new reg, and keep the inst for remat of other siblings.
+    // The inst is saved in LiveRangeEdit::DeadRemats and will be deleted
+    // after all the allocations of the func are done.
+    if (isOrigDef) {
+      unsigned NewDest = createFrom(Dest);
+      pop_back();
+      markDeadRemat(MI);
+      const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+      MI->substituteRegister(Dest, NewDest, 0, TRI);
+      MI->getOperand(0).setIsDead(false);
+    } else {
+      if (TheDelegate)
+        TheDelegate->LRE_WillEraseInstruction(MI);
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+      ++NumDCEDeleted;
+    }
   }

   // Erase any virtregs that are now empty and unused. There may be
@@ -332,8 +352,9 @@
   }
 }

-void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
-                                      ArrayRef<unsigned> RegsBeingSpilled) {
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                                      ArrayRef<unsigned> RegsBeingSpilled,
+                                      bool NoSplit) {
   ToShrinkSet ToShrink;

   for (;;) {
@@ -355,6 +376,9 @@
     if (!LIS.shrinkToUses(LI, &Dead))
       continue;

+    if (NoSplit)
+      continue;
+
     // Don't create new intervals for a register being spilled.
     // The new intervals would have to be spilled anyway so its not worth it.
     // Also they currently aren't spilled so creating them and not spilling
Index: lib/CodeGen/RegAllocBase.h
===================================================================
--- lib/CodeGen/RegAllocBase.h
+++ lib/CodeGen/RegAllocBase.h
@@ -65,6 +65,12 @@
   LiveRegMatrix *Matrix;
   RegisterClassInfo RegClassInfo;

+  /// Any inst that defines an original reg but whose defs are all dead after
+  /// remat is saved in DeadRemats. The deletion of such insts is postponed
+  /// till all the allocations are done, so their remat exprs are always
+  /// available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
   RegAllocBase()
     : TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {}
@@ -77,6 +83,9 @@
   // physical register assignments.
   void allocatePhysRegs();

+  // Remove dead defs because of rematerialization.
+  void eliminateDeadRemats();
+
   // Get a temporary reference to a Spiller instance.
   virtual Spiller &spiller() = 0;
Index: lib/CodeGen/RegAllocBase.cpp
===================================================================
--- lib/CodeGen/RegAllocBase.cpp
+++ lib/CodeGen/RegAllocBase.cpp
@@ -153,3 +153,11 @@
     }
   }
 }
+
+void RegAllocBase::eliminateDeadRemats() {
+  for (auto ent : DeadRemats) {
+    LIS->RemoveMachineInstrFromMaps(*ent);
+    ent->eraseFromParent();
+  }
+  DeadRemats.clear();
+}
Index: lib/CodeGen/RegAllocBasic.cpp
===================================================================
--- lib/CodeGen/RegAllocBasic.cpp
+++ lib/CodeGen/RegAllocBasic.cpp
@@ -199,7 +199,7 @@
     Matrix->unassign(Spill);

     // Spill the extracted interval.
-    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+    LiveRangeEdit LRE(&Spill, SplitVRegs, &DeadRemats, *MF, *LIS, VRM);
     spiller().spill(LRE);
   }
   return true;
@@ -258,7 +258,7 @@
   DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
   if (!VirtReg.isSpillable())
     return ~0u;
-  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
+  LiveRangeEdit LRE(&VirtReg, SplitVRegs, &DeadRemats, *MF, *LIS, VRM);
   spiller().spill(LRE);

   // The live virtual register requesting allocation was spilled, so tell
@@ -283,6 +283,7 @@
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));

   allocatePhysRegs();
+  eliminateDeadRemats();

   // Diagnostic output before rewriting
   DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
Index: lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- lib/CodeGen/RegAllocGreedy.cpp
+++ lib/CodeGen/RegAllocGreedy.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//

-#include "llvm/CodeGen/Passes.h"
 #include "AllocationOrder.h"
 #include "InterferenceCache.h"
 #include "LiveDebugVariables.h"
@@ -33,6 +32,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -44,6 +44,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <queue>

@@ -55,14 +56,14 @@
 STATISTIC(NumLocalSplits,  "Number of split local live ranges");
 STATISTIC(NumEvicted,      "Number of interferences evicted");

-static cl::opt<SplitEditor::ComplementSpillMode>
-SplitSpillMode("split-spill-mode", cl::Hidden,
-  cl::desc("Spill mode for splitting live ranges"),
-  cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
-             clEnumValN(SplitEditor::SM_Size,  "size",  "Optimize for size"),
-             clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
-             clEnumValEnd),
-  cl::init(SplitEditor::SM_Partition));
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+    "split-spill-mode", cl::Hidden,
+    cl::desc("Spill mode for splitting live ranges"),
+    cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+               clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+               clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+               clEnumValEnd),
+    cl::init(SplitEditor::SM_Speed));

 static cl::opt<unsigned>
 LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
@@ -397,6 +398,7 @@
                                      SmallVirtRegSet &, unsigned);
   void tryHintRecoloring(LiveInterval &);
   void tryHintsRecoloring();
+  void postOptimization();

   /// Model the information carried by one end of a copy.
   struct HintInfo {
@@ -1465,7 +1467,7 @@
                                     SmallVectorImpl<unsigned> &NewVRegs) {
   SmallVector<unsigned, 8> UsedCands;
   // Prepare split editor.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitSpillMode);

   // Assign all edge bundles to the preferred candidate, or NoCand.
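
The hunks above and below mechanically thread one allocator-owned DeadRemats set into every LiveRangeEdit the allocators create. A minimal sketch of that lifecycle, under simplified stand-in types (everything named *Sketch is hypothetical and invented for illustration; the real types are MachineInstr, LiveRangeEdit, and RegAllocBase):

#include "llvm/ADT/SmallPtrSet.h"

struct MachineInstrSketch {}; // stand-in for MachineInstr

// Stand-in for LiveRangeEdit: instead of erasing a dead remat source, it
// parks the instruction in the shared set so sibling registers can still
// rematerialize from it later during allocation.
struct LiveRangeEditSketch {
  llvm::SmallPtrSet<MachineInstrSketch *, 32> *DeadRemats = nullptr;

  void markDeadRemat(MachineInstrSketch *MI) {
    if (DeadRemats) // null for allocators that do not defer the deletion
      DeadRemats->insert(MI);
  }
};

// Stand-in for RegAllocBase: the parked instructions are destroyed only
// after allocatePhysRegs() has finished, mirroring eliminateDeadRemats().
struct RegAllocSketch {
  llvm::SmallPtrSet<MachineInstrSketch *, 32> DeadRemats;

  void eliminateDeadRemats() {
    for (MachineInstrSketch *MI : DeadRemats)
      delete MI; // the real code unmaps the instruction and erases it
    DeadRemats.clear();
  }
};

The design point is ownership: because the set lives in the allocator base class, every LiveRangeEdit created during selectOrSplit shares it, and each allocator calls its eliminateDeadRemats variant exactly once, after allocation is complete.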
@@ -1513,7 +1515,7 @@
   assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
   unsigned Reg = VirtReg.reg;
   bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitSpillMode);

   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -1585,7 +1587,7 @@

   // Always enable split spill mode, since we're effectively spilling to a
   // register.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitEditor::SM_Size);

   ArrayRef<SlotIndex> Uses = SA->getUseSlots();
@@ -1908,7 +1910,7 @@
                << '-' << Uses[BestAfter] << ", " << BestDiff << ", "
                << (BestAfter - BestBefore + 1) << " instrs\n");

-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
   SE->reset(LREdit);

   SE->openIntv();
@@ -2551,7 +2553,7 @@
     NewVRegs.push_back(VirtReg.reg);
   } else {
     NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
-    LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+    LiveRangeEdit LRE(&VirtReg, NewVRegs, &DeadRemats, *MF, *LIS, VRM, this);
     spiller().spill(LRE);
     setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
@@ -2564,6 +2566,11 @@
   return 0;
 }

+void RAGreedy::postOptimization() {
+  eliminateDeadRemats();
+  startHoistSpiller(*MF, *VRM, *LIS, &spiller());
+}
+
 bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
                << "********** Function: " << mf.getName() << '\n');
@@ -2587,6 +2594,7 @@
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   DomTree = &getAnalysis<MachineDominatorTree>();
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+  createHoistSpiller(*this, *MF, *VRM, &spiller());
   Loops = &getAnalysis<MachineLoopInfo>();
   Bundles = &getAnalysis<EdgeBundles>();
   SpillPlacer = &getAnalysis<SpillPlacement>();
@@ -2609,6 +2617,8 @@
   allocatePhysRegs();
   tryHintsRecoloring();

+  postOptimization();
+
   releaseMemory();
   return true;
 }
Index: lib/CodeGen/RegAllocPBQP.cpp
===================================================================
--- lib/CodeGen/RegAllocPBQP.cpp
+++ lib/CodeGen/RegAllocPBQP.cpp
@@ -123,6 +123,12 @@

   RegSet VRegsToAlloc, EmptyIntervalVRegs;

+  /// Any inst that defines an original reg but whose defs are all dead after
+  /// remat is saved in DeadRemats. The deletion of such insts is postponed
+  /// till all the allocations are done, so their remat exprs are always
+  /// available for the remat of all the siblings of the original reg.
+  SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
   /// \brief Finds the initial set of vreg intervals to allocate.
   void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
@@ -146,6 +152,8 @@
   void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
                      VirtRegMap &VRM) const;

+  /// Remove dead defs because of rematerialization.
+  void eliminateDeadRemats(LiveIntervals &LIS);
 };

 char RegAllocPBQP::ID = 0;
@@ -631,7 +639,8 @@
                                  VirtRegMap &VRM, Spiller &VRegSpiller) {
   VRegsToAlloc.erase(VReg);
-  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, &DeadRemats, MF, LIS,
+                    &VRM);
   VRegSpiller.spill(LRE);

   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -713,6 +722,14 @@
   }
 }

+void RegAllocPBQP::eliminateDeadRemats(LiveIntervals &LIS) {
+  for (auto ent : DeadRemats) {
+    LIS.RemoveMachineInstrFromMaps(*ent);
+    ent->eraseFromParent();
+  }
+  DeadRemats.clear();
+}
+
 static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
                                              unsigned NumInstr) {
   // All intervals have a spill weight that is mostly proportional to the number
@@ -798,6 +815,7 @@

   // Finalise allocation, allocate empty ranges.
   finalizeAlloc(MF, LIS, VRM);
+  eliminateDeadRemats(LIS);
   VRegsToAlloc.clear();
   EmptyIntervalVRegs.clear();
Index: lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- lib/CodeGen/RegisterCoalescer.cpp
+++ lib/CodeGen/RegisterCoalescer.cpp
@@ -459,8 +459,8 @@

 void RegisterCoalescer::eliminateDeadDefs() {
   SmallVector<unsigned, 8> NewRegs;
-  LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
-                nullptr, this).eliminateDeadDefs(DeadDefs);
+  LiveRangeEdit(nullptr, NewRegs, nullptr, *MF, *LIS, nullptr, this)
+      .eliminateDeadDefs(DeadDefs);
 }

 void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
Index: lib/CodeGen/Spiller.h
===================================================================
--- lib/CodeGen/Spiller.h
+++ lib/CodeGen/Spiller.h
@@ -16,6 +16,7 @@
   class MachineFunction;
   class MachineFunctionPass;
   class VirtRegMap;
+  class LiveIntervals;

   /// Spiller interface.
   ///
@@ -28,7 +29,6 @@

     /// spill - Spill the LRE.getParent() live interval.
     virtual void spill(LiveRangeEdit &LRE) = 0;
-
   };

   /// Create and return a spiller that will insert spill code directly instead
@@ -37,6 +37,13 @@
                                 MachineFunction &mf,
                                 VirtRegMap &vrm);

+  void createHoistSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                          VirtRegMap &vrm, Spiller *);
+
+  /// startHoistSpiller - Run the HoistSpiller attached to the given spiller
+  /// and start hoisting spills.
+  void startHoistSpiller(MachineFunction &mf, VirtRegMap &vrm,
+                         LiveIntervals &lis, Spiller *);
 }

 #endif
Index: lib/CodeGen/SplitKit.h
===================================================================
--- lib/CodeGen/SplitKit.h
+++ lib/CodeGen/SplitKit.h
@@ -18,6 +18,7 @@
 #include "LiveRangeCalc.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -329,9 +330,13 @@
   MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
                                           MachineBasicBlock *DefMBB);

-  /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
-  /// way that minimizes code size. This implements the SM_Size spill mode.
-  void hoistCopiesForSize();
+  /// removeRedundentCopies - Remove redundant back-copies once it has been
+  /// decided that those back-copies will not be hoisted.
+  void removeRedundentCopies(DenseSet<unsigned> &NotToHoistSet,
+                             SmallVectorImpl<VNInfo *> &BackCopies);
+
+  /// hoistCopies - Hoist back-copies to the complement interval.
+  void hoistCopies();

   /// transferValues - Transfer values to the new ranges.
   /// Return true if any ranges were skipped.
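
Before the SplitKit.cpp hunks below, a sketch of the SM_Speed decision they implement: the summed block frequency of a parent value's back-copies is compared against the frequency of their nearest common dominator, and hoisting is skipped when the dominator is hotter. shouldHoistBackCopies() is a hypothetical distillation for illustration only, not part of the SplitEditor API:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/BlockFrequency.h"
using namespace llvm;

// Hoisting replaces all back-copies of one parent value with a single copy
// in their nearest common dominator. That pays off only when the dominator
// runs less often than the copies it replaces, which is the
// "MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]" test in the
// hoistCopies() hunk below.
bool shouldHoistBackCopies(BlockFrequency DomFreq,
                           ArrayRef<BlockFrequency> BackCopyFreqs) {
  BlockFrequency Total(0);
  for (BlockFrequency F : BackCopyFreqs)
    Total += F;
  return DomFreq <= Total;
}

When the test fails, the patch does not give up entirely: NotToHoistSet records the value, and removeRedundentCopies() still deletes any back-copy that is dominated by another back-copy of equal value.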
Index: lib/CodeGen/SplitKit.cpp
===================================================================
--- lib/CodeGen/SplitKit.cpp
+++ lib/CodeGen/SplitKit.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -430,7 +431,12 @@
   bool Late = RegIdx != 0;

   // Attempt cheap-as-a-copy rematerialization.
-  LiveRangeEdit::Remat RM(ParentVNI);
+  unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+  LiveRangeEdit::Remat RM(ParentVNI, OrigVNI);
+  RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
   if (Edit->canRematerializeAt(RM, UseIdx, true)) {
     Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
     ++NumRemats;
@@ -716,7 +722,62 @@
   }
 }

-void SplitEditor::hoistCopiesForSize() {
+/// Remove redundant back-copies when the back-copies for the same ParentVNI
+/// cannot be hoisted because hoisting would cost too much.
+void SplitEditor::removeRedundentCopies(DenseSet<unsigned> &NotToHoistSet,
+                                        SmallVectorImpl<VNInfo *> &BackCopies) {
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+  LiveInterval *Parent = &Edit->getParent();
+  SmallVector<SmallPtrSet<VNInfo *, 4>, 8> EqualVNs(Parent->getNumValNums());
+  SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+  // Aggregate VNIs having the same value as ParentVNI.
+  for (VNInfo *VNI : LI->valnos) {
+    if (VNI->isUnused())
+      continue;
+    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+    EqualVNs[ParentVNI->id].insert(VNI);
+  }
+
+  // For the VNI aggregation of each ParentVNI, collect the dominated, i.e.,
+  // redundant VNIs into BackCopies.
+  for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+    VNInfo *ParentVNI = Parent->getValNumInfo(i);
+    if (!NotToHoistSet.count(ParentVNI->id))
+      continue;
+    for (auto Ent1 : EqualVNs[ParentVNI->id]) {
+      for (auto Ent2 : EqualVNs[ParentVNI->id]) {
+        if (Ent1 == Ent2 || DominatedVNIs.count(Ent1) ||
+            DominatedVNIs.count(Ent2))
+          continue;
+
+        MachineBasicBlock *MBB1 = LIS.getMBBFromIndex(Ent1->def);
+        MachineBasicBlock *MBB2 = LIS.getMBBFromIndex(Ent2->def);
+        if (MBB1 == MBB2) {
+          DominatedVNIs.insert(Ent1->def < Ent2->def ? Ent2 : Ent1);
+        } else if (MDT.dominates(MBB1, MBB2)) {
+          DominatedVNIs.insert(Ent2);
+        } else if (MDT.dominates(MBB2, MBB1)) {
+          DominatedVNIs.insert(Ent1);
+        }
+      }
+    }
+    if (!DominatedVNIs.empty()) {
+      forceRecompute(0, ParentVNI);
+      for (auto Ent : DominatedVNIs) {
+        BackCopies.push_back(Ent);
+      }
+      DominatedVNIs.clear();
+    }
+  }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies of
+/// the same ParentVNI and hoist the back-copies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and it is not
+/// beneficial to do the hoisting, simply remove the dominated back-copies
+/// of the same ParentVNI.
+void SplitEditor::hoistCopies() {
   // Get the complement interval, always RegIdx 0.
   LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   LiveInterval *Parent = &Edit->getParent();
@@ -725,6 +786,11 @@
   // indexed by ParentVNI->id.
   typedef std::pair<MachineBasicBlock *, SlotIndex> DomPair;
   SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+  // The total cost of all the back-copies for each ParentVNI.
+  SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+  // The set of ParentVNI->ids for which hoisting back-copies is not
+  // beneficial for speed.
+ DenseSet NotToHoistSet; // Find the nearest common dominator for parent values with multiple // back-copies. If a single back-copy dominates, put it in DomPair.second. @@ -740,6 +806,7 @@ continue; MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def); + DomPair &Dom = NearestDom[ParentVNI->id]; // Keep directly defined parent values. This is either a PHI or an @@ -774,6 +841,7 @@ else if (Near != Dom.first) // None dominate. Hoist to common dominator, need new def. Dom = DomPair(Near, SlotIndex()); + Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB); } DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def @@ -792,6 +860,11 @@ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def); // Get a less loopy dominator than Dom.first. Dom.first = findShallowDominator(Dom.first, DefMBB); + if (SpillMode == SM_Speed && + MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) { + NotToHoistSet.insert(ParentVNI->id); + continue; + } SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot(); Dom.second = defFromParent(0, ParentVNI, Last, *Dom.first, @@ -806,11 +879,18 @@ continue; VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def); const DomPair &Dom = NearestDom[ParentVNI->id]; - if (!Dom.first || Dom.second == VNI->def) + if (!Dom.first || Dom.second == VNI->def || + NotToHoistSet.count(ParentVNI->id)) continue; BackCopies.push_back(VNI); forceRecompute(0, ParentVNI); } + + // If it is not beneficial to hoist all the BackCopies, simply remove + // redundant BackCopies in speed mode. + if (SpillMode == SM_Speed && !NotToHoistSet.empty()) + removeRedundantCopies(NotToHoistSet, BackCopies); + removeBackCopies(BackCopies); } @@ -1004,6 +1084,8 @@ // Dead defs end at the dead slot. if (S.end != S.valno->def.getDeadSlot()) continue; + if (S.valno->isPHIDef()) + continue; MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def); assert(MI && "Missing instruction for dead def"); MI->addRegisterDead(LI->reg, &TRI); @@ -1048,10 +1130,8 @@ // Leave all back-copies as is. break; case SM_Size: - hoistCopiesForSize(); - break; case SM_Speed: - llvm_unreachable("Spill mode 'speed' not implemented yet"); + hoistCopies(); } // Transfer the simply mapped values, check if any are skipped. Index: test/CodeGen/AArch64/aarch64-deferred-spilling.ll =================================================================== --- test/CodeGen/AArch64/aarch64-deferred-spilling.ll +++ test/CodeGen/AArch64/aarch64-deferred-spilling.ll @@ -1,514 +0,0 @@ -;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED -;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 -disable-fp-elim | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR - -; Check that we do not end up with useless spill code. -; -; Move to the basic block we are interested in. -; -; CHECK: // %if.then.120 -; -; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill -; Check that w21 wouldn't need to be spilled since it is never reused. -; REGULAR-NOT: {{[wx]}}21{{,?}} -; -; Check that w22 is used to carry a value through the call.
-; DEFERRED-NOT: str {{[wx]}}22, -; DEFERRED: mov {{[wx]}}22, -; DEFERRED-NOT: str {{[wx]}}22, -; -; CHECK: bl fprintf -; -; DEFERRED-NOT: ldr {{[wx]}}22, -; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22 -; DEFERRED-NOT: ldr {{[wx]}}22, -; -; REGULAR-NOT: {{[wx]}}21{{,?}} -; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload -; -; End of the basic block we are interested in. -; CHECK: b -; CHECK: {{[^:]+}}: // %sw.bb.123 - -%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 } -%struct.__sbuf = type { i8*, i64 } -%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* } -%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* } - -@__sF = external global [0 x %struct.__sFILE], align 8 -@.str = private unnamed_addr constant [20 x i8] c"\0A [%d: stuff+mf \00", align 1 - -declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...) - -declare void @bar(i32) - -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) - -define i32 @foo(%struct.DState* %s) { -entry: - %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1 - %tmp = load i32, i32* %state, align 4 - %cmp = icmp eq i32 %tmp, 10 - %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40 - br i1 %cmp, label %if.end.thread, label %if.end - -if.end.thread: ; preds = %entry - %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 - %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 - %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 - %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 - %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 - %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 - %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 - %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 - %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 - %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 - %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 - %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 - %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 - %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 - %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 - %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 - %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 - %save_zj = getelementptr inbounds 
%struct.DState, %struct.DState* %s, i64 0, i32 58 - %tmp1 = bitcast i32* %save_i to i8* - call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false) - br label %sw.default - -if.end: ; preds = %entry - %.pre = load i32, i32* %save_i, align 4 - %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41 - %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4 - %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42 - %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4 - %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43 - %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4 - %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44 - %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4 - %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45 - %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4 - %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46 - %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4 - %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47 - %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4 - %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48 - %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4 - %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49 - %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4 - %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50 - %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4 - %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51 - %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4 - %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52 - %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4 - %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53 - %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4 - %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54 - %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4 - %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55 - %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4 - %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56 - %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4 - %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57 - %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4 - %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58 - %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4 - switch i32 %tmp, label %sw.default [ - i32 13, label %sw.bb - i32 14, label %if.end.sw.bb.65_crit_edge - i32 25, label %if.end.sw.bb.123_crit_edge - ] 
- -if.end.sw.bb.123_crit_edge: ; preds = %if.end - %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 - br label %sw.bb.123 - -if.end.sw.bb.65_crit_edge: ; preds = %if.end - %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 - %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4 - br label %sw.bb.65 - -sw.bb: ; preds = %if.end - %sunkaddr = ptrtoint %struct.DState* %s to i64 - %sunkaddr485 = add i64 %sunkaddr, 8 - %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32* - store i32 13, i32* %sunkaddr486, align 4 - %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8 - %tmp2 = load i32, i32* %bsLive, align 4 - %cmp28.400 = icmp sgt i32 %tmp2, 7 - br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph - -sw.bb.if.then.29_crit_edge: ; preds = %sw.bb - %sunkaddr487 = ptrtoint %struct.DState* %s to i64 - %sunkaddr488 = add i64 %sunkaddr487, 32 - %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32* - %.pre425 = load i32, i32* %sunkaddr489, align 4 - br label %if.then.29 - -if.end.33.lr.ph: ; preds = %sw.bb - %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream** - %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8 - %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1 - %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4 - %tmp4 = add i32 %.pre430, -1 - br label %if.end.33 - -if.then.29: ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge - %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ] - %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ] - %sub = add nsw i32 %.lcssa393, -8 - %shr = lshr i32 %tmp5, %sub - %and = and i32 %shr, 255 - %sunkaddr491 = ptrtoint %struct.DState* %s to i64 - %sunkaddr492 = add i64 %sunkaddr491, 36 - %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32* - store i32 %sub, i32* %sunkaddr493, align 4 - %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9 - store i32 %and, i32* %blockSize100k, align 4 - %and.off = add nsw i32 %and, -49 - %tmp6 = icmp ugt i32 %and.off, 8 - br i1 %tmp6, label %save_state_and_return, label %if.end.62 - -if.end.33: ; preds = %while.body.backedge, %if.end.33.lr.ph - %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ] - %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ] - %cmp35 = icmp eq i32 %lsr.iv482, -1 - br i1 %cmp35, label %save_state_and_return, label %if.end.37 - -if.end.37: ; preds = %if.end.33 - %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8** - %sunkaddr494 = ptrtoint %struct.DState* %s to i64 - %sunkaddr495 = add i64 %sunkaddr494, 32 - %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32* - %tmp9 = load i32, i32* %sunkaddr496, align 4 - %shl = shl i32 %tmp9, 8 - %tmp10 = load i8*, i8** %tmp8, align 8 - %tmp11 = load i8, i8* %tmp10, align 1 - %conv = zext i8 %tmp11 to i32 - %or = or i32 %conv, %shl - store i32 %or, i32* %sunkaddr496, align 4 - %add = add nsw i32 %tmp7, 8 - %sunkaddr497 = ptrtoint %struct.DState* %s to i64 - %sunkaddr498 = add i64 %sunkaddr497, 36 - %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32* - store i32 %add, i32* %sunkaddr499, align 4 - %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1 - store i8* %incdec.ptr, i8** %tmp8, align 8 - %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64 - %sunkaddr501 = 
add i64 %sunkaddr500, 8 - %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32* - store i32 %lsr.iv482, i32* %sunkaddr502, align 4 - %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64 - %sunkaddr504 = add i64 %sunkaddr503, 12 - %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32* - %tmp12 = load i32, i32* %sunkaddr505, align 4 - %inc = add i32 %tmp12, 1 - store i32 %inc, i32* %sunkaddr505, align 4 - %cmp49 = icmp eq i32 %inc, 0 - br i1 %cmp49, label %if.then.51, label %while.body.backedge - -if.then.51: ; preds = %if.end.37 - %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64 - %sunkaddr507 = add i64 %sunkaddr506, 16 - %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32* - %tmp13 = load i32, i32* %sunkaddr508, align 4 - %inc53 = add i32 %tmp13, 1 - store i32 %inc53, i32* %sunkaddr508, align 4 - br label %while.body.backedge - -while.body.backedge: ; preds = %if.then.51, %if.end.37 - %lsr.iv.next483 = add i32 %lsr.iv482, -1 - %cmp28 = icmp sgt i32 %add, 7 - br i1 %cmp28, label %if.then.29, label %if.end.33 - -if.end.62: ; preds = %if.then.29 - %sub64 = add nsw i32 %and, -48 - %sunkaddr509 = ptrtoint %struct.DState* %s to i64 - %sunkaddr510 = add i64 %sunkaddr509, 40 - %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32* - store i32 %sub64, i32* %sunkaddr511, align 4 - br label %sw.bb.65 - -sw.bb.65: ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge - %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ] - %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ] - %sunkaddr512 = ptrtoint %struct.DState* %s to i64 - %sunkaddr513 = add i64 %sunkaddr512, 8 - %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32* - store i32 14, i32* %sunkaddr514, align 4 - %cmp70.397 = icmp sgt i32 %tmp14, 7 - br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph - -if.end.82.lr.ph: ; preds = %sw.bb.65 - %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream** - %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8 - %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1 - %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4 - %tmp16 = add i32 %.pre431, -1 - br label %if.end.82 - -if.then.72: ; preds = %while.body.68.backedge, %sw.bb.65 - %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ] - %sub76 = add nsw i32 %.lcssa390, -8 - %sunkaddr516 = ptrtoint %struct.DState* %s to i64 - %sunkaddr517 = add i64 %sunkaddr516, 36 - %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32* - store i32 %sub76, i32* %sunkaddr518, align 4 - %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11 - %tmp17 = load i32, i32* %currBlockNo, align 4 - %inc117 = add nsw i32 %tmp17, 1 - store i32 %inc117, i32* %currBlockNo, align 4 - %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12 - %tmp18 = load i32, i32* %verbosity, align 4 - %cmp118 = icmp sgt i32 %tmp18, 1 - br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0 - -if.end.82: ; preds = %while.body.68.backedge, %if.end.82.lr.ph - %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ] - %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ] - %cmp85 = icmp eq i32 %lsr.iv480, -1 - br i1 %cmp85, label %save_state_and_return, label %if.end.88 - -if.end.88: ; preds = %if.end.82 - %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8** - %sunkaddr519 = ptrtoint 
%struct.DState* %s to i64 - %sunkaddr520 = add i64 %sunkaddr519, 32 - %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32* - %tmp21 = load i32, i32* %sunkaddr521, align 4 - %shl90 = shl i32 %tmp21, 8 - %tmp22 = load i8*, i8** %tmp20, align 8 - %tmp23 = load i8, i8* %tmp22, align 1 - %conv93 = zext i8 %tmp23 to i32 - %or94 = or i32 %conv93, %shl90 - store i32 %or94, i32* %sunkaddr521, align 4 - %add97 = add nsw i32 %tmp19, 8 - %sunkaddr522 = ptrtoint %struct.DState* %s to i64 - %sunkaddr523 = add i64 %sunkaddr522, 36 - %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32* - store i32 %add97, i32* %sunkaddr524, align 4 - %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1 - store i8* %incdec.ptr100, i8** %tmp20, align 8 - %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64 - %sunkaddr526 = add i64 %sunkaddr525, 8 - %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32* - store i32 %lsr.iv480, i32* %sunkaddr527, align 4 - %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64 - %sunkaddr529 = add i64 %sunkaddr528, 12 - %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32* - %tmp24 = load i32, i32* %sunkaddr530, align 4 - %inc106 = add i32 %tmp24, 1 - store i32 %inc106, i32* %sunkaddr530, align 4 - %cmp109 = icmp eq i32 %inc106, 0 - br i1 %cmp109, label %if.then.111, label %while.body.68.backedge - -if.then.111: ; preds = %if.end.88 - %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64 - %sunkaddr532 = add i64 %sunkaddr531, 16 - %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32* - %tmp25 = load i32, i32* %sunkaddr533, align 4 - %inc114 = add i32 %tmp25, 1 - store i32 %inc114, i32* %sunkaddr533, align 4 - br label %while.body.68.backedge - -while.body.68.backedge: ; preds = %if.then.111, %if.end.88 - %lsr.iv.next481 = add i32 %lsr.iv480, -1 - %cmp70 = icmp sgt i32 %add97, 7 - br i1 %cmp70, label %if.then.72, label %if.end.82 - -if.then.120: ; preds = %if.then.72 - %call = tail call i32 (%struct.__sFILE*, i8*, ...) 
@fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117) - br label %sw.bb.123 - -sw.bb.123: ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge - %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ] - %sunkaddr534 = ptrtoint %struct.DState* %s to i64 - %sunkaddr535 = add i64 %sunkaddr534, 8 - %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32* - store i32 25, i32* %sunkaddr536, align 4 - %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4 - %cmp128.395 = icmp sgt i32 %tmp26, 7 - br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph - -sw.bb.123.if.then.130_crit_edge: ; preds = %sw.bb.123 - %sunkaddr537 = ptrtoint %struct.DState* %s to i64 - %sunkaddr538 = add i64 %sunkaddr537, 32 - %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32* - %.pre429 = load i32, i32* %sunkaddr539, align 4 - br label %if.then.130 - -if.end.140.lr.ph: ; preds = %sw.bb.123 - %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream** - %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8 - %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1 - %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4 - %tmp28 = add i32 %.pre432, -1 - br label %if.end.140 - -if.then.130: ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge - %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ] - %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ] - %sub134 = add nsw i32 %.lcssa, -8 - %shr135 = lshr i32 %tmp29, %sub134 - store i32 %sub134, i32* %bsLive127.pre-phi, align 4 - %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13 - %tmp30 = load i32, i32* %origPtr, align 4 - %shl175 = shl i32 %tmp30, 8 - %conv176 = and i32 %shr135, 255 - %or177 = or i32 %shl175, %conv176 - store i32 %or177, i32* %origPtr, align 4 - %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27 - %tmp31 = load i32, i32* %nInUse, align 4 - %add179 = add nsw i32 %tmp31, 2 - br label %save_state_and_return - -if.end.140: ; preds = %while.body.126.backedge, %if.end.140.lr.ph - %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ] - %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ] - %cmp143 = icmp eq i32 %lsr.iv, -1 - br i1 %cmp143, label %save_state_and_return, label %if.end.146 - -if.end.146: ; preds = %if.end.140 - %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8** - %sunkaddr541 = ptrtoint %struct.DState* %s to i64 - %sunkaddr542 = add i64 %sunkaddr541, 32 - %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32* - %tmp34 = load i32, i32* %sunkaddr543, align 4 - %shl148 = shl i32 %tmp34, 8 - %tmp35 = load i8*, i8** %tmp33, align 8 - %tmp36 = load i8, i8* %tmp35, align 1 - %conv151 = zext i8 %tmp36 to i32 - %or152 = or i32 %conv151, %shl148 - store i32 %or152, i32* %sunkaddr543, align 4 - %add155 = add nsw i32 %tmp32, 8 - store i32 %add155, i32* %bsLive127.pre-phi, align 4 - %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1 - store i8* %incdec.ptr158, i8** %tmp33, align 8 - %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64 - %sunkaddr545 = add i64 
%sunkaddr544, 8 - %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32* - store i32 %lsr.iv, i32* %sunkaddr546, align 4 - %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64 - %sunkaddr548 = add i64 %sunkaddr547, 12 - %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32* - %tmp37 = load i32, i32* %sunkaddr549, align 4 - %inc164 = add i32 %tmp37, 1 - store i32 %inc164, i32* %sunkaddr549, align 4 - %cmp167 = icmp eq i32 %inc164, 0 - br i1 %cmp167, label %if.then.169, label %while.body.126.backedge - -if.then.169: ; preds = %if.end.146 - %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64 - %sunkaddr551 = add i64 %sunkaddr550, 16 - %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32* - %tmp38 = load i32, i32* %sunkaddr552, align 4 - %inc172 = add i32 %tmp38, 1 - store i32 %inc172, i32* %sunkaddr552, align 4 - br label %while.body.126.backedge - -while.body.126.backedge: ; preds = %if.then.169, %if.end.146 - %lsr.iv.next = add i32 %lsr.iv, -1 - %cmp128 = icmp sgt i32 %add155, 7 - br i1 %cmp128, label %if.then.130, label %if.end.140 - -sw.default: ; preds = %if.end, %if.end.thread - %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ] - %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ] - %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ] - %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ] - %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ] - %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ] - %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ] - %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ] - %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ] - %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ] - %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ] - %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ] - %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ] - %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ] - %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ] - %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ] - %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ] - %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ] - %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ] - %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ] - %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ] - %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ] - %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ] - %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ] - %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ] - %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ] - %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ] - %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ] - %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ] - %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ 
%save_nblock13.phi.trans.insert, %if.end ] - %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ] - %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ] - %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ] - %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ] - %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ] - %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ] - %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ] - tail call void @bar(i32 4001) - br label %save_state_and_return - -save_state_and_return: ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29 - %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ] - %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ] - %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ] - %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ] - %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ] - %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ] - %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ] - %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ] - %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ] - %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ] - %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ] - %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ] - %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ] - %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ] - %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, 
%if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ] - %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ] - %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ] - %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ] - %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ] - %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ] - %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ] - %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ] - %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ] - %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ] - %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ] - %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ] - %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ 
%save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ] - %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ] - %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ] - %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ] - %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ] - %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ] - %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ] - %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ] - %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ] - %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ] - %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ] - %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ] - store i32 %tmp58, i32* %save_i, align 4 - store i32 %tmp59, i32* %save_j3.pre-phi468, align 4 - store i32 %tmp60, i32* %save_t4.pre-phi466, align 4 - store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4 - store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4 - store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4 - store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4 - store i32 %tmp64, i32* 
%save_groupNo9.pre-phi456, align 4 - store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4 - store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4 - store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4 - store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4 - store i32 %tmp68, i32* %save_es14.pre-phi446, align 4 - store i32 %tmp69, i32* %save_N15.pre-phi444, align 4 - store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4 - store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4 - store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4 - store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4 - store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4 - ret i32 %retVal.0 -} - -!0 = !{!"branch_weights", i32 10, i32 1} Index: test/CodeGen/ARM/subreg-remat.ll =================================================================== --- test/CodeGen/ARM/subreg-remat.ll +++ test/CodeGen/ARM/subreg-remat.ll @@ -11,10 +11,10 @@ ; since it implicitly reads the ssub_1 sub-register. ; ; CHECK: f1 -; CHECK: vmov d0, r0, r0 -; CHECK: vldr s1, LCPI +; CHECK: vmov d1, r0, r0 +; CHECK: vldr s3, LCPI ; The vector must be spilled: -; CHECK: vstr d0, +; CHECK: vstr d1, ; CHECK: asm clobber d0 ; And reloaded after the asm: ; CHECK: vldr [[D16:d[0-9]+]], Index: test/CodeGen/SPARC/spill.ll =================================================================== --- test/CodeGen/SPARC/spill.ll +++ test/CodeGen/SPARC/spill.ll @@ -7,8 +7,9 @@ ;; registers to ensure the spill will happen. ; CHECK-LABEL: test_i32_spill: -; CHECK: and %i0, %i1, %o0 -; CHECK: st %o0, [%fp+{{.+}}] +; CHECK: and %i0, %i1, %i0 +; CHECK: mov %i0, %o0 +; CHECK: st %i0, [%fp+{{.+}}] ; CHECK: add %o0, %o0, %g0 ; CHECK: ld [%fp+{{.+}}, %i0 define i32 @test_i32_spill(i32 %a, i32 %b) { @@ -20,9 +21,11 @@ } ; CHECK-LABEL: test_i64_spill: -; CHECK: and %i0, %i2, %o0 -; CHECK: and %i1, %i3, %o1 -; CHECK: std %o0, [%fp+{{.+}}] +; CHECK: and %i0, %i2, %i4 +; CHECK: and %i1, %i3, %i5 +; CHECK: mov %i4, %o0 +; CHECK: mov %i5, %o1 +; CHECK: std %i4, [%fp+{{.+}}] ; CHECK: add %o0, %o0, %g0 ; CHECK: ldd [%fp+{{.+}}, %i0 define i64 @test_i64_spill(i64 %a, i64 %b) { Index: test/CodeGen/X86/avx512-bugfix-25270.ll =================================================================== --- test/CodeGen/X86/avx512-bugfix-25270.ll +++ test/CodeGen/X86/avx512-bugfix-25270.ll @@ -10,9 +10,9 @@ ; CHECK-NEXT: subq $112, %rsp ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vmovdqu32 (%rbx), %zmm0 -; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 ; CHECK-NEXT: vmovdqa32 %zmm1, (%rbx) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill ; CHECK-NEXT: callq _Print__512 ; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload ; CHECK-NEXT: callq _Print__512 Index: test/CodeGen/X86/fold-push.ll =================================================================== --- test/CodeGen/X86/fold-push.ll +++ test/CodeGen/X86/fold-push.ll @@ -5,8 +5,9 @@ define void @test(i32 %a, i32 %b) optsize nounwind { ; CHECK-LABEL: test: -; CHECK: movl [[EAX:%e..]], (%esp) -; CHECK-NEXT: pushl [[EAX]] +; CHECK: addl +; CHECK-NEXT: pushl [[EAX:%e..]] +; CHECK-NEXT: movl [[EAX]], 4(%esp) ; CHECK-NEXT: calll ; CHECK-NEXT: addl $4, %esp ; CHECK: nop @@ -24,8 +25,9 @@ define void @test_min(i32 %a, i32 %b) minsize nounwind { ; CHECK-LABEL: test_min: -; CHECK: movl [[EAX:%e..]], (%esp) -; CHECK-NEXT: pushl [[EAX]] +; CHECK: addl +; CHECK-NEXT: pushl [[EAX:%e..]] +; CHECK-NEXT: movl [[EAX]], 4(%esp) ; CHECK-NEXT: calll ; CHECK-NEXT: popl ; CHECK: nop 
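The first new test below, hoist-spill.ll, asserts that no two spill instructions remaining after hoisting write the same rsp-based stack slot: its RUN line extracts every Spill-annotated store, reduces it to its stack offset, and fails on any duplicate. Conceptually the shell pipeline performs the check sketched here; the input list and the name hasDuplicateSpillSlot are hypothetical, used only to spell out the pipeline's logic.

#include <string>
#include <unordered_set>
#include <vector>

// SpillSlots: one entry per Spill-annotated store in the final assembly,
// already reduced to its address operand, e.g. "-24(%rsp)".
// Returns true exactly when the test's sort | uniq -d | awk pipeline fails.
bool hasDuplicateSpillSlot(const std::vector<std::string> &SpillSlots) {
  std::unordered_set<std::string> Seen;
  for (const std::string &Slot : SpillSlots)
    if (!Seen.insert(Slot).second)
      return true; // the same slot is written twice: a spill hoisting missed
  return false;
}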
Index: test/CodeGen/X86/hoist-spill.ll =================================================================== --- test/CodeGen/X86/hoist-spill.ll +++ test/CodeGen/X86/hoist-spill.ll @@ -0,0 +1,115 @@ +; RUN: llc < %s | grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/) exit -1}' +; Check that there are no spills to the same stack slot after hoisting. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = external global i32*, align 8 +@b = external global i32, align 4 +@d = external global i32*, align 8 + +; Function Attrs: norecurse noreturn nounwind uwtable +define void @fn1(i32 %p1) #0 { +entry: + %tmp = load i32*, i32** @d, align 8 + %tmp1 = load i32*, i32** @a, align 8 + %tmp2 = sext i32 %p1 to i64 + br label %for.cond + +for.cond: ; preds = %for.inc14, %entry + %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ] + %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ] + %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ] + %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ] + %tmp3 = icmp sgt i32 undef, 0 + %smax52 = select i1 %tmp3, i32 undef, i32 0 + %tmp4 = zext i32 %smax52 to i64 + %tmp5 = icmp sgt i64 undef, %tmp4 + %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4 + %tmp6 = add nsw i64 %smax53, 1 + %tmp7 = sub nsw i64 %tmp6, %tmp4 + %tmp8 = add nsw i64 %tmp7, -8 + %tmp9 = sub i32 undef, %indvar + %tmp10 = icmp sgt i64 %tmp2, 0 + %smax40 = select i1 %tmp10, i64 %tmp2, i64 0 + %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40 + %indvars.iv30 = add i32 %indvars.iv30.in, -1 + %tmp11 = icmp sgt i32 %indvars.iv30, 0 + %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0 + %tmp12 = zext i32 %smax to i64 + %sub = sub nsw i32 %p1, %c.0 + %cmp = icmp sgt i32 %sub, 0 + %sub.
= select i1 %cmp, i32 %sub, i32 0 + %cmp326 = icmp sgt i32 %k.0, %p1 + br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader + +for.body.preheader: ; preds = %for.cond + br label %for.body + +for.cond4.preheader: ; preds = %for.body, %for.cond + %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ] + %cmp528 = icmp sgt i32 %sub., %p1 + br i1 %cmp528, label %for.inc14, label %for.body6.preheader + +for.body6.preheader: ; preds = %for.cond4.preheader + br i1 undef, label %for.body6, label %min.iters.checked + +min.iters.checked: ; preds = %for.body6.preheader + br i1 undef, label %for.body6, label %vector.memcheck + +vector.memcheck: ; preds = %min.iters.checked + %bound1 = icmp ule i32* undef, %scevgep41 + %memcheck.conflict = and i1 undef, %bound1 + br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + %lcmp.mod = icmp eq i64 undef, 0 + br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol, %vector.body.preheader + %prol.iter.cmp = icmp eq i64 undef, 0 + br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol + +vector.body.preheader.split: ; preds = %vector.body.prol, %vector.body.preheader + %tmp13 = icmp ult i64 %tmp8, 24 + br i1 %tmp13, label %middle.block, label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.split + %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ] + %index.next = add i64 %index, 8 + %offset.idx.1 = add i64 %tmp12, %index.next + %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1 + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4 + %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1 + %tmp17 = bitcast i32* %tmp16 to <4 x i32>* + store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4 + %index.next.3 = add i64 %index, 32 + br i1 undef, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body, %vector.body.preheader.split + br i1 undef, label %for.inc14, label %for.body6 + +for.body: ; preds = %for.body, %for.body.preheader + %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ] + %add = add nsw i32 %k.127, 1 + %tmp18 = load i32, i32* undef, align 4 + store i32 %tmp18, i32* @b, align 4 + br i1 undef, label %for.body, label %for.cond4.preheader + +for.body6: ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader + %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ] + %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32 + %tmp19 = load i32, i32* %arrayidx8, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32 + store i32 %tmp19, i32* %arrayidx10, align 4 + %cmp5 = icmp slt i64 %indvars.iv32, undef + br i1 %cmp5, label %for.body6, label %for.inc14 + +for.inc14: ; preds = %for.body6, %middle.block, %for.cond4.preheader + %inc15 = add nuw nsw i32 %c.0, 1 + %indvar.next = add i32 %indvar, 1 + br label %for.cond +} + +attributes #0 = { norecurse noreturn nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64"
"target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/X86/new-remat.ll =================================================================== --- test/CodeGen/X86/new-remat.ll +++ test/CodeGen/X86/new-remat.ll @@ -0,0 +1,75 @@ +; RUN: llc < %s | FileCheck %s +; Check that all spills are rematerialized. +; CHECK-NOT: Spill + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@b = common global double 0.000000e+00, align 8 +@a = common global i32 0, align 4 + +; Function Attrs: nounwind uwtable +define i32 @uniform_testdata(i32 %p1) #0 { +entry: + %cmp3 = icmp sgt i32 %p1, 0 + br i1 %cmp3, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = add i32 %p1, -1 + %xtraiter = and i32 %p1, 7 + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader + +for.body.prol.preheader: ; preds = %for.body.preheader + br label %for.body.prol + +for.body.prol: ; preds = %for.body.prol.preheader, %for.body.prol + %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ] + %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ] + %1 = load double, double* @b, align 8 + %call.prol = tail call double @pow(double %1, double 2.500000e-01) #2 + %inc.prol = add nuw nsw i32 %i.04.prol, 1 + %prol.iter.sub = add i32 %prol.iter, -1 + %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0 + br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol + +for.body.preheader.split.loopexit: ; preds = %for.body.prol + %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ] + br label %for.body.preheader.split + +for.body.preheader.split: ; preds = %for.body.preheader.split.loopexit, %for.body.preheader + %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ] + %2 = icmp ult i32 %0, 7 + br i1 %2, label %for.end.loopexit, label %for.body.preheader.split.split + +for.body.preheader.split.split: ; preds = %for.body.preheader.split + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader.split.split + %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ] + %3 = load double, double* @b, align 8 + %call = tail call double @pow(double %3, double 2.500000e-01) #2 + %4 = load double, double* @b, align 8 + %call.1 = tail call double @pow(double %4, double 2.500000e-01) #2 + %inc.7 = add nsw i32 %i.04, 8 + %exitcond.7 = icmp eq i32 %inc.7, %p1 + br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body + +for.end.loopexit.unr-lcssa: ; preds = %for.body + br label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body.preheader.split, %for.end.loopexit.unr-lcssa + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %5 = load i32, i32* @a, align 4 + ret i32 %5 +} + +; Function Attrs: nounwind +declare double @pow(double, double) #1 + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false"
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + Index: test/CodeGen/X86/ragreedy-hoist-spill.ll =================================================================== --- test/CodeGen/X86/ragreedy-hoist-spill.ll +++ test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -2,6 +2,7 @@ ; This testing case is reduced from 254.gap SyFgets function. ; We make sure a spill is not hoisted to a hotter outer loop. +; We make sure a spill is hoisted to a cold BB inside the hotter outer loop. %struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] } %struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 } @@ -181,6 +182,10 @@ br i1 %cmp476, label %if.end517, label %do.body479.preheader do.body479.preheader: + ; CHECK: do.body479.preheader + ; The spill is hoisted here: although the loop at depth 1 is even hotter than the loop at depth 2, do.body479.preheader is cold. + ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) + ; CHECK: land.rhs485 %cmp4833314 = icmp eq i8 undef, 0 br i1 %cmp4833314, label %if.end517, label %land.rhs485 @@ -200,8 +205,8 @@ lor.rhs500: ; CHECK: lor.rhs500 - ; Make sure that we don't hoist the spill to outer loops. - ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) + ; Make sure the spill is hoisted to a cold preheader in the outer loop. + ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp) ; CHECK: callq {{.*}}maskrune %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256) br i1 undef, label %land.lhs.true504, label %do.body479.backedge Index: test/CodeGen/X86/vselect-minmax.ll =================================================================== --- test/CodeGen/X86/vselect-minmax.ll +++ test/CodeGen/X86/vselect-minmax.ll @@ -4888,13 +4888,14 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test122: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -5163,7 +5164,6 @@ ; SSE2-LABEL: test124: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -5172,6 +5172,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 @@ -5465,13 +5466,14 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test126: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 =
[2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -5794,7 +5796,6 @@ ; SSE2-LABEL: test128: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -5803,6 +5804,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 @@ -7608,13 +7610,14 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test154: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -7881,7 +7884,6 @@ ; SSE2-LABEL: test156: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -7890,6 +7892,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 @@ -8181,13 +8184,14 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) { ; SSE2-LABEL: test158: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm10, %xmm0 @@ -8508,7 +8512,6 @@ ; SSE2-LABEL: test160: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -8517,6 +8520,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm8 ; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm10, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm11