Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -55,6 +55,65 @@
                                      cl::desc("Disable inline spill hoisting"));
 
 namespace {
+class HoistSpiller {
+  MachineFunction &MF;
+  LiveIntervals &LIS;
+  LiveStacks &LSS;
+  AliasAnalysis *AA;
+  MachineDominatorTree &MDT;
+  MachineLoopInfo &Loops;
+  VirtRegMap &VRM;
+  MachineFrameInfo &MFI;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  const MachineBlockFrequencyInfo &MBFI;
+
+  // Original register recorded for each stack slot that received a spill.
+  DenseMap<int, unsigned> StackSlotToReg;
+  // Spills grouped by (stack slot, VNInfo of the original register at the
+  // spill). Spills in one group store equal values to the same slot, so
+  // they can be merged and are the candidates for hoisting.
+  using MergableSpillsMap =
+      DenseMap<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>;
+  MergableSpillsMap MergableSpills;
+
+  /// Virt2SiblingsMap - Maps an original register to the set of all its
+  /// siblings. Hoisting a spill into another BB needs a sibling that is
+  /// live there, to serve as the source register of the new spill.
+  DenseMap<unsigned, DenseSet<unsigned>> Virt2SiblingsMap;
+
+  void getVisitOrders(
+      MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+      SmallVectorImpl<MachineDomTreeNode *> &Orders,
+      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+      DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKept,
+      DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+  void runHoistSpills(unsigned OrigReg, VNInfo *OrigVNI,
+                      SmallPtrSet<MachineInstr *, 16> &Spills,
+                      SmallVectorImpl<MachineInstr *> &SpillsToRm,
+                      DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+  bool isSpillCandBB(unsigned OrigReg, VNInfo *OrigVNI, MachineBasicBlock *BB,
+                     unsigned &LiveReg);
+
+public:
+  HoistSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+      : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+        LSS(pass.getAnalysis<LiveStacks>()),
+        AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+        MDT(pass.getAnalysis<MachineDominatorTree>()),
+        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+        MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+        TII(*mf.getSubtarget().getInstrInfo()),
+        TRI(*mf.getSubtarget().getRegisterInfo()),
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+
+  void hoistAllSpills(LiveRangeEdit &Edit);
+  void addToMergableSpills(MachineInstr *Spill, int StackSlot,
+                           unsigned Original);
+  bool rmFromMergableSpills(MachineInstr *Spill, int StackSlot);
+};
+
 class InlineSpiller : public Spiller {
   MachineFunction &MF;
   LiveIntervals &LIS;
@@ -135,6 +194,9 @@
   // Dead defs generated during spilling.
   SmallVector<MachineInstr*, 8> DeadDefs;
 
+  // Records spill information and does the hoisting; deleted in destructor.
+  HoistSpiller *HSpiller;
+
-  ~InlineSpiller() override {}
+  ~InlineSpiller() override { delete HSpiller; }
 
 public:
@@ -147,9 +209,14 @@
         MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
         TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
-        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()), HSpiller(nullptr) {
+  }
 
   void spill(LiveRangeEdit &) override;
+  void setHSpiller(HoistSpiller *HS) { HSpiller = HS; }
+  HoistSpiller *getHSpiller() { return HSpiller; }
+  /// Type inquiry hook for isa, cast and dyn_cast; accepts every Spiller.
+  static inline bool classof(const Spiller *V) { return true; }
 
 private:
   bool isSnippet(const LiveInterval &SnipLI);
@@ -194,6 +261,21 @@
   return new InlineSpiller(pass, mf, vrm);
 }
 
+// Attach a new HoistSpiller to \p spiller, which must be an InlineSpiller.
+void createHoistSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                        VirtRegMap &vrm, Spiller *spiller) {
+  cast<InlineSpiller>(spiller)->setHSpiller(new HoistSpiller(pass, mf, vrm));
+}
+
+void startHoistSpiller(MachineFunction &mf, VirtRegMap &vrm, LiveIntervals &lis,
+                       Spiller *spiller) {
+  SmallVector<unsigned, 4> NewVRegs;
+  LiveRangeEdit LRE(nullptr, NewVRegs, nullptr, mf, lis, &vrm, nullptr);
+  // cast<> instead of dereferencing an unchecked dyn_cast<> result.
+  cast<InlineSpiller>(spiller)->getHSpiller()->hoistAllSpills(LRE);
+  assert(NewVRegs.size() == 0 &&
+         "No new vregs should be generated in hoistAllSpills");
+}
 }
 
 //===----------------------------------------------------------------------===//
@@ -808,7 +890,8 @@
         MI->setDesc(TII.get(TargetOpcode::KILL));
         DeadDefs.push_back(MI);
         ++NumSpillsRemoved;
-        --NumSpills;
+        if (HSpiller && HSpiller->rmFromMergableSpills(MI, StackSlot))
+          --NumSpills;
       }
     }
   } while (!WorkList.empty());
@@ -1023,6 +1106,9 @@
   if (InstrReg != Reg || FI != StackSlot)
     return false;
 
+  if (!IsLoad && HSpiller)
+    HSpiller->rmFromMergableSpills(MI, StackSlot);
+
   DEBUG(dbgs() << "Coalescing stack access: " << *MI);
   LIS.RemoveMachineInstrFromMaps(MI);
   MI->eraseFromParent();
@@ -1147,6 +1233,10 @@
     LIS.removePhysRegDefAt(Reg, Idx);
   }
 
+  int FI;
+  if (TII.isStoreToStackSlot(MI, FI) && HSpiller &&
+      HSpiller->rmFromMergableSpills(MI, FI))
+    --NumSpills;
   LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
   MI->eraseFromParent();
 
@@ -1173,9 +1263,11 @@
 
   if (!WasCopy)
     ++NumFolded;
-  else if (Ops.front().second == 0)
+  else if (Ops.front().second == 0) {
     ++NumSpills;
-  else
+    if (HSpiller)
+      HSpiller->addToMergableSpills(FoldMI, StackSlot, Original);
+  } else
     ++NumReloads;
   return true;
 }
@@ -1210,6 +1302,8 @@
   DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
                                            "spill"));
   ++NumSpills;
+  if (HSpiller)
+    HSpiller->addToMergableSpills(std::next(MI), StackSlot, Original);
 }
 
 /// spillAroundUses - insert spill code around each use of Reg.
@@ -1396,3 +1490,332 @@
 
   Edit->calculateRegClassAndHint(MF, Loops, MBFI);
 }
+
+// Record a newly inserted spill, keyed by its slot and the value it stores.
+void HoistSpiller::addToMergableSpills(MachineInstr *Spill, int StackSlot,
+                                       unsigned Original) {
+  // Remember which original register this stack slot belongs to.
+  StackSlotToReg[StackSlot] = Original;
+  const SlotIndex SpillIdx = LIS.getInstructionIndex(Spill).getRegSlot();
+  VNInfo *SpilledVNI = LIS.getInterval(Original).getVNInfoAt(SpillIdx);
+  MergableSpills[std::make_pair(StackSlot, SpilledVNI)].insert(Spill);
+}
+
+// When a spill is removed, remove the spill from MergableSpills map. Return
+// true if the spill was recorded; unknown slots or spills add no map entries.
+bool HoistSpiller::rmFromMergableSpills(MachineInstr *Spill, int StackSlot) {
+  auto It = StackSlotToReg.find(StackSlot);
+  if (It == StackSlotToReg.end())
+    return false;
+  SlotIndex Idx = LIS.getInstructionIndex(Spill);
+  VNInfo *OrigVNI = LIS.getInterval(It->second).getVNInfoAt(Idx.getRegSlot());
+  auto SpillsIt = MergableSpills.find(std::make_pair(StackSlot, OrigVNI));
+  return SpillsIt != MergableSpills.end() && SpillsIt->second.erase(Spill);
+}
+
+// Check whether BB can hold a hoisted spill of OrigReg: some sibling of
+// OrigReg must be live at the insertion point. On success the first such
+// sibling found is returned in LiveReg.
+bool HoistSpiller::isSpillCandBB(unsigned OrigReg, VNInfo *OrigVNI,
+                                 MachineBasicBlock *BB, unsigned &LiveReg) {
+  // The insertion point is the first terminator of BB, or the end of the
+  // block when BB has no terminator.
+  MachineBasicBlock::iterator InsertPt = BB->getFirstTerminator();
+  SlotIndex InsertIdx = InsertPt == BB->end()
+                            ? LIS.getMBBEndIdx(BB).getPrevSlot()
+                            : LIS.getInstructionIndex(InsertPt);
+  DenseSet<unsigned> &SiblingRegs = Virt2SiblingsMap[OrigReg];
+  assert((LIS.getInterval(OrigReg)).getVNInfoAt(InsertIdx) == OrigVNI &&
+         "Unexpected VNI");
+
+  for (unsigned SibReg : SiblingRegs) {
+    // A sibling is usable when it carries a value at the insertion point.
+    if (!LIS.getInterval(SibReg).getVNInfoAt(InsertIdx))
+      continue;
+    LiveReg = SibReg;
+    return true;
+  }
+  return false;
+}
+
+/// Compute the top-down order in which to visit the dominator tree nodes
+/// that lead from Root to spills. Redundant spills are found and put into
+/// SpillsToRm at the same time; surviving spills are keyed in SpillsToKept.
+void HoistSpiller::getVisitOrders(
+    MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineDomTreeNode *> &Orders,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKept,
+    DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+  // For each spill, check the BB the spill is located at and set
+  // SpillBBToSpill[]. If a BB contains more than one spill, only
+  // keep the spill with smaller SlotIndex.
+  for (const auto CurrentSpill : Spills) {
+    MachineBasicBlock *Block = CurrentSpill->getParent();
+    MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+    MachineInstr *PrevSpill = SpillBBToSpill[Node];
+    if (PrevSpill) {
+      SlotIndex PIdx = LIS.getInstructionIndex(PrevSpill);
+      SlotIndex CIdx = LIS.getInstructionIndex(CurrentSpill);
+      MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+      MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+      SpillsToRm.push_back(SpillToRm);
+      SpillBBToSpill[Node] = SpillToKeep;
+    } else {
+      SpillBBToSpill[Node] = CurrentSpill;
+    }
+  }
+  for (const auto SpillToRm : SpillsToRm)
+    Spills.erase(SpillToRm);
+
+  SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+  SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+  MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+  // For every node on the dominator tree with a spill, walk up the
+  // dominator tree until reaching the Root node. If another node with a
+  // spill is found on the path, the original node is redundant and will
+  // be removed. All the nodes on the path from a node with non-redundant
+  // spill to the Root node will be added to the WorkSet, which is the set
+  // we want to look at during hoisting spills in the next step.
+  for (const auto Spill : Spills) {
+    MachineBasicBlock *Block = Spill->getParent();
+    MachineDomTreeNode *Node = MDT[Block];
+    MachineInstr *SpillToRm = nullptr;
+    while (Node != RootIDomNode) {
+      if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+        SpillToRm = SpillBBToSpill[MDT[Block]];
+        break;
+      } else if (WorkSet.count(Node)) {
+        break;
+      } else {
+        NodesOnPath.insert(Node);
+      }
+      Node = Node->getIDom();
+    }
+    if (SpillToRm) {
+      SpillsToRm.push_back(SpillToRm);
+    } else {
+      SpillsToKept[MDT[Block]] = 0;
+      WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+    }
+    NodesOnPath.clear();
+  }
+
+  // Sort the nodes in WorkSet in top-bottom order and save the nodes
+  // in Orders.
+  unsigned idx = 0;
+  Orders.push_back(MDT.DT->getNode(Root));
+  do {
+    MachineDomTreeNode *Node = Orders[idx++];
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      if (WorkSet.count(Child))
+        Orders.push_back(Child);
+    }
+  } while (idx != Orders.size());
+
+  // Emit the whole dump from one DEBUG block so that release builds do not
+  // walk Orders with an empty loop body.
+  DEBUG({
+    dbgs() << "Orders size is " << Orders.size() << "\n";
+    for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); ++RIt)
+      dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",";
+    dbgs() << "\n";
+  });
+}
+
+/// Try to hoist spills according to BB hotness. The spills to be removed
+/// will be saved in SpillsToRm. The spills to be inserted will be saved in
+/// SpillsToIns, mapping the target BB to the register to be stored.
+void HoistSpiller::runHoistSpills(
+    unsigned OrigReg, VNInfo *OrigVNI, SmallPtrSet<MachineInstr *, 16> &Spills,
+    SmallVectorImpl<MachineInstr *> &SpillsToRm,
+    DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+  // Visit order of dominator tree nodes.
+  SmallVector<MachineDomTreeNode *, 32> Orders;
+  // SpillsToKept contains all the nodes where spills are to be inserted
+  // during hoisting. If the spill to be inserted is an original spill
+  // (not a hoisted one), the value of the map entry is 0. If the spill
+  // is a hoisted spill, the value of the map entry is the VReg to be used
+  // on the RHS of the spill.
+  DenseMap<MachineDomTreeNode *, unsigned> SpillsToKept;
+  // Map from BB to the spill inside of it.
+  DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+  MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI->def);
+  getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKept,
+                 SpillBBToSpill);
+
+  // SpillsInSubTree keeps the map from a dom tree node to a nodes set.
+  // It saves the locations where spills are to be inserted in the
+  // subtree of the node.
+  DenseMap<MachineDomTreeNode *, SmallPtrSet<MachineDomTreeNode *, 16>>
+      SpillsInSubTree;
+  // Iterate Orders set in reverse order, which will be a bottom-top order
+  // in the dominator tree. Once we visit a dom tree node, we know its
+  // children have already been visited and the spill locations in the
+  // subtrees of all the children have been determined.
+  SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+  for (; RIt != Orders.rend(); RIt++) {
+    MachineBasicBlock *Block = (*RIt)->getBlock();
+
+    // If Block contains an original spill, simply continue.
+    if (SpillsToKept.find(*RIt) != SpillsToKept.end() && !SpillsToKept[*RIt]) {
+      SpillsInSubTree[*RIt].insert(*RIt);
+      continue;
+    }
+
+    // Merge the children's spill sets into SpillsInSubTree[*RIt]. Only find()
+    // child entries: inserting could rehash the map under SubTreeSpills.
+    auto &SubTreeSpills = SpillsInSubTree[*RIt];
+    for (MachineDomTreeNode *Child : (*RIt)->getChildren()) {
+      auto ChildIt = SpillsInSubTree.find(Child);
+      if (ChildIt == SpillsInSubTree.end())
+        continue;
+      SubTreeSpills.insert(ChildIt->second.begin(), ChildIt->second.end());
+      SpillsInSubTree.erase(ChildIt);
+    }
+
+    // No spills in subtree, simply continue.
+    if (SpillsInSubTree[*RIt].empty())
+      continue;
+
+    // Check whether Block is a possible candidate to insert spill.
+    unsigned LiveReg = 0;
+    if (!isSpillCandBB(OrigReg, OrigVNI, Block, LiveReg))
+      continue;
+
+    // Now Block is a proper target BB for hoisting spills. Decide whether to
+    // hoist the spills to current node. Get existing cost of all the spills
+    // in SpillsInSubTree[Block].
+    BlockFrequency SpillCost = 0;
+    for (const auto SpillBB : SpillsInSubTree[*RIt])
+      SpillCost += MBFI.getBlockFreq(SpillBB->getBlock());
+
+    // If there are multiple spills that could be merged, bias a little
+    // to hoist the spill.
+    BranchProbability MarginProb = (SpillsInSubTree[*RIt].size() > 1)
+                                       ? BranchProbability(9, 10)
+                                       : BranchProbability(1, 1);
+    if (SpillCost > MBFI.getBlockFreq(Block) * MarginProb) {
+      // Hoist: Move spills to current Block.
+      for (const auto SpillBB : SpillsInSubTree[*RIt]) {
+        // When SpillBB is a BB that contains an original spill, insert the
+        // spill to SpillsToRm.
+        if (SpillsToKept.find(SpillBB) != SpillsToKept.end() &&
+            !SpillsToKept[SpillBB]) {
+          MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+          SpillsToRm.push_back(SpillToRm);
+        }
+        // SpillBB will not contain spill anymore, remove it from SpillsToKept.
+        SpillsToKept.erase(SpillBB);
+      }
+      // Current Block is the BB containing the new hoisted spill. Add it to
+      // SpillsToKept. LiveReg is the RHS of the spill.
+      SpillsToKept[*RIt] = LiveReg;
+      DEBUG({
+        dbgs() << "spills in BB: ";
+        for (const auto Rspill : SpillsInSubTree[*RIt])
+          dbgs() << Rspill->getBlock()->getNumber() << " ";
+        dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+               << "\n";
+      });
+      SpillsInSubTree[*RIt].clear();
+      SpillsInSubTree[*RIt].insert(*RIt);
+    }
+  }
+  // For spills in SpillsToKept with LiveReg set (i.e., not original spill),
+  // save them to SpillsToIns.
+  for (const auto ent : SpillsToKept) {
+    if (ent.second)
+      SpillsToIns[ent.first->getBlock()] = ent.second;
+  }
+}
+
+/// For spills with equal values, remove redundant spills and hoist spills
+/// to a less hot spot.
+void HoistSpiller::hoistAllSpills(LiveRangeEdit &Edit) {
+  // Save the mapping between stackslot and its original reg. Collect the slots
+  // with mergable spills once instead of rescanning the map for every vreg.
+  DenseSet<int> SpillSlots;
+  for (const auto &ent : MergableSpills)
+    SpillSlots.insert(ent.first.first);
+  DenseMap<int, unsigned> SlotToOrigReg;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    int Slot = VRM.getStackSlot(Reg);
+    // insert() keeps the first register seen for a slot.
+    if (Slot != VirtRegMap::NO_STACK_SLOT && SpillSlots.count(Slot))
+      SlotToOrigReg.insert(std::make_pair(Slot, VRM.getOriginal(Reg)));
+    unsigned Original = VRM.getPreSplitReg(Reg);
+    if (!MRI.def_empty(Reg))
+      Virt2SiblingsMap[Original].insert(Reg);
+  }
+
+  // Each entry in MergableSpills contains a spill set with equal values.
+  for (auto &ent : MergableSpills) {
+    int Slot = ent.first.first;
+    unsigned OrigReg = SlotToOrigReg[Slot];
+    VNInfo *OrigVNI = ent.first.second;
+    SmallPtrSet<MachineInstr *, 16> &EqValSpills = ent.second;
+    if (!ent.second.size())
+      continue;
+
+    DEBUG({
+      dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+             << "Equal spills in BB: ";
+      for (const auto spill : EqValSpills)
+        dbgs() << spill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // SpillsToRm is the spill set to be removed from EqValSpills.
+    SmallVector<MachineInstr *, 16> SpillsToRm;
+    // SpillsToIns is the spill set to be newly inserted after hoisting.
+    DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+    runHoistSpills(OrigReg, OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+    DEBUG({
+      dbgs() << "Finally inserted spills in BB: ";
+      for (const auto Ispill : SpillsToIns)
+        dbgs() << Ispill.first->getNumber() << " ";
+      dbgs() << "\nFinally removed spills in BB: ";
+      for (const auto Rspill : SpillsToRm)
+        dbgs() << Rspill->getParent()->getNumber() << " ";
+      dbgs() << "\n";
+    });
+
+    // Stack live range update.
+    LiveInterval &StackIntvl = LSS.getInterval(Slot);
+    if (!SpillsToIns.empty() || !SpillsToRm.empty()) {
+      LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+      StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+                                     StackIntvl.getValNumInfo(0));
+    }
+
+    // Insert hoisted spills.
+    for (auto const ent : SpillsToIns) {
+      MachineBasicBlock *BB = ent.first;
+      unsigned LiveReg = ent.second;
+      MachineBasicBlock::iterator MI = BB->getFirstTerminator();
+      TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
+                              MRI.getRegClass(LiveReg), &TRI);
+      LIS.InsertMachineInstrRangeInMaps(std::prev(MI), MI);
+      ++NumSpills;
+    }
+
+    // Remove redundant spills or change them to dead instructions.
+    NumSpills -= SpillsToRm.size();
+    for (auto const ent : SpillsToRm) {
+      ent->setDesc(TII.get(TargetOpcode::KILL));
+      for (unsigned i = ent->getNumOperands(); i; --i) {
+        MachineOperand &MO = ent->getOperand(i - 1);
+        if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+          ent->RemoveOperand(i - 1);
+      }
+    }
+    Edit.eliminateDeadDefs(SpillsToRm, None, true);
+  }
+}
Index: lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- lib/CodeGen/RegAllocGreedy.cpp
+++ lib/CodeGen/RegAllocGreedy.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/Passes.h"
 #include "AllocationOrder.h"
 #include "InterferenceCache.h"
 #include "LiveDebugVariables.h"
@@ -33,6 +32,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -44,6 +44,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <queue>
 
@@ -55,14 +56,15 @@
 STATISTIC(NumLocalSplits,  "Number of split local live ranges");
 STATISTIC(NumEvicted,      "Number of interferences evicted");
 
-static cl::opt<SplitEditor::ComplementSpillMode>
-SplitSpillMode("split-spill-mode", cl::Hidden,
-  cl::desc("Spill mode for splitting live ranges"),
-  cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
-             clEnumValN(SplitEditor::SM_Size,  "size",  "Optimize for size"),
-             clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
-             clEnumValEnd),
-  cl::init(SplitEditor::SM_Partition));
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+    "split-spill-mode", cl::Hidden,
+    cl::desc("Spill mode for splitting live ranges"),
+    cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+               clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+               clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+               clEnumValEnd),
+    // NOTE(review): default changed from SM_Partition to SM_Size; confirm.
+    cl::init(SplitEditor::SM_Size));
 
 static cl::opt<unsigned>
 LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
@@ -397,6 +399,7 @@
                                SmallVirtRegSet &, unsigned);
   void tryHintRecoloring(LiveInterval &);
   void tryHintsRecoloring();
+  void postOptimization();
 
   /// Model the information carried by one end of a copy.
   struct HintInfo {
@@ -1457,7 +1460,7 @@
                                  SmallVectorImpl<unsigned> &NewVRegs) {
   SmallVector<unsigned, 8> UsedCands;
   // Prepare split editor.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitSpillMode);
 
   // Assign all edge bundles to the preferred candidate, or NoCand.
@@ -1505,7 +1508,7 @@
   assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
   unsigned Reg = VirtReg.reg;
   bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitSpillMode);
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -1577,7 +1580,7 @@
 
   // Always enable split spill mode, since we're effectively spilling to a
   // register.
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
   SE->reset(LREdit, SplitEditor::SM_Size);
 
   ArrayRef<SlotIndex> Uses = SA->getUseSlots();
@@ -1900,7 +1903,7 @@
                << '-' << Uses[BestAfter] << ", " << BestDiff
                << ", " << (BestAfter - BestBefore + 1) << " instrs\n");
 
-  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
   SE->reset(LREdit);
 
   SE->openIntv();
@@ -2556,6 +2559,10 @@
   return 0;
 }
 
+void RAGreedy::postOptimization() {
+  startHoistSpiller(*MF, *VRM, *LIS, &spiller()); // Hoist recorded spills.
+}
+
 bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
                << "********** Function: " << mf.getName() << '\n');
@@ -2579,6 +2586,7 @@
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   DomTree = &getAnalysis<MachineDominatorTree>();
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+  createHoistSpiller(*this, *MF, *VRM, &spiller());
   Loops = &getAnalysis<MachineLoopInfo>();
   Bundles = &getAnalysis<EdgeBundles>();
   SpillPlacer = &getAnalysis<SpillPlacement>();
@@ -2601,6 +2609,8 @@
 
   allocatePhysRegs();
   tryHintsRecoloring();
+  postOptimization();
+
   releaseMemory();
   return true;
 }
Index: lib/CodeGen/Spiller.h
===================================================================
--- lib/CodeGen/Spiller.h
+++ lib/CodeGen/Spiller.h
@@ -16,6 +16,7 @@
   class MachineFunction;
   class MachineFunctionPass;
   class VirtRegMap;
+  class LiveIntervals;
 
   /// Spiller interface.
   ///
@@ -28,7 +29,6 @@
 
     /// spill - Spill the LRE.getParent() live interval.
     virtual void spill(LiveRangeEdit &LRE) = 0;
-
   };
 
   /// Create and return a spiller that will insert spill code directly instead
@@ -37,6 +37,13 @@
                                MachineFunction &mf,
                                VirtRegMap &vrm);
 
+  void createHoistSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                          VirtRegMap &vrm, Spiller *);
+
+  /// startHoistSpiller - run spill hoisting with the HoistSpiller that
+  /// createHoistSpiller attached to the given spiller.
+  void startHoistSpiller(MachineFunction &mf, VirtRegMap &vrm,
+                         LiveIntervals &lis, Spiller *);
 }
 
 #endif
Index: test/CodeGen/AArch64/aarch64-deferred-spilling.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-deferred-spilling.ll
+++ test/CodeGen/AArch64/aarch64-deferred-spilling.ll
@@ -1,514 +0,0 @@
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED
-;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR
-
-; Check that we do not end up with useless spill code.
-;
-; Move to the basic block we are interested in.
-;
-; CHECK: // %if.then.120
-;
-; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill
-; Check that w21 wouldn't need to be spilled since it is never reused.
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-;
-; Check that w22 is used to carry a value through the call.
-; DEFERRED-NOT: str {{[wx]}}22,
-; DEFERRED: mov {{[wx]}}22,
-; DEFERRED-NOT: str {{[wx]}}22,
-;
-; CHECK:        bl      fprintf
-;
-; DEFERRED-NOT: ldr {{[wx]}}22,
-; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22
-; DEFERRED-NOT: ldr {{[wx]}}22,
-;
-; REGULAR-NOT: {{[wx]}}21{{,?}}
-; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload
-;
-; End of the basic block we are interested in.
-; CHECK:        b
-; CHECK: {{[^:]+}}: // %sw.bb.123
-
-%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-%struct.__sbuf = type { i8*, i64 }
-%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* }
-%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* }
-
-@__sF = external global [0 x %struct.__sFILE], align 8
-@.str = private unnamed_addr constant [20 x i8] c"\0A    [%d: stuff+mf \00", align 1
-
-declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...)
-
-declare void @bar(i32)
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-
-define i32 @foo(%struct.DState* %s) {
-entry:
-  %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1
-  %tmp = load i32, i32* %state, align 4
-  %cmp = icmp eq i32 %tmp, 10
-  %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40
-  br i1 %cmp, label %if.end.thread, label %if.end
-
-if.end.thread:                                    ; preds = %entry
-  %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
-  %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
-  %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
-  %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
-  %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
-  %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
-  %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
-  %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
-  %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
-  %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
-  %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
-  %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
-  %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
-  %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
-  %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
-  %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
-  %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
-  %save_zj = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
-  %tmp1 = bitcast i32* %save_i to i8*
-  call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false)
-  br label %sw.default
-
-if.end:                                           ; preds = %entry
-  %.pre = load i32, i32* %save_i, align 4
-  %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
-  %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4
-  %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
-  %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4
-  %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
-  %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4
-  %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
-  %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4
-  %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
-  %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4
-  %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
-  %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4
-  %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
-  %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4
-  %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
-  %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4
-  %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
-  %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4
-  %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
-  %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4
-  %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
-  %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4
-  %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
-  %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4
-  %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
-  %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4
-  %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
-  %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4
-  %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
-  %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4
-  %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
-  %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4
-  %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
-  %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4
-  %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
-  %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4
-  switch i32 %tmp, label %sw.default [
-    i32 13, label %sw.bb
-    i32 14, label %if.end.sw.bb.65_crit_edge
-    i32 25, label %if.end.sw.bb.123_crit_edge
-  ]
-
-if.end.sw.bb.123_crit_edge:                       ; preds = %if.end
-  %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  br label %sw.bb.123
-
-if.end.sw.bb.65_crit_edge:                        ; preds = %if.end
-  %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4
-  br label %sw.bb.65
-
-sw.bb:                                            ; preds = %if.end
-  %sunkaddr = ptrtoint %struct.DState* %s to i64
-  %sunkaddr485 = add i64 %sunkaddr, 8
-  %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32*
-  store i32 13, i32* %sunkaddr486, align 4
-  %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
-  %tmp2 = load i32, i32* %bsLive, align 4
-  %cmp28.400 = icmp sgt i32 %tmp2, 7
-  br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph
-
-sw.bb.if.then.29_crit_edge:                       ; preds = %sw.bb
-  %sunkaddr487 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr488 = add i64 %sunkaddr487, 32
-  %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32*
-  %.pre425 = load i32, i32* %sunkaddr489, align 4
-  br label %if.then.29
-
-if.end.33.lr.ph:                                  ; preds = %sw.bb
-  %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8
-  %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1
-  %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4
-  %tmp4 = add i32 %.pre430, -1
-  br label %if.end.33
-
-if.then.29:                                       ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge
-  %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ]
-  %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ]
-  %sub = add nsw i32 %.lcssa393, -8
-  %shr = lshr i32 %tmp5, %sub
-  %and = and i32 %shr, 255
-  %sunkaddr491 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr492 = add i64 %sunkaddr491, 36
-  %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32*
-  store i32 %sub, i32* %sunkaddr493, align 4
-  %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9
-  store i32 %and, i32* %blockSize100k, align 4
-  %and.off = add nsw i32 %and, -49
-  %tmp6 = icmp ugt i32 %and.off, 8
-  br i1 %tmp6, label %save_state_and_return, label %if.end.62
-
-if.end.33:                                        ; preds = %while.body.backedge, %if.end.33.lr.ph
-  %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ]
-  %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ]
-  %cmp35 = icmp eq i32 %lsr.iv482, -1
-  br i1 %cmp35, label %save_state_and_return, label %if.end.37
-
-if.end.37:                                        ; preds = %if.end.33
-  %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8**
-  %sunkaddr494 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr495 = add i64 %sunkaddr494, 32
-  %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32*
-  %tmp9 = load i32, i32* %sunkaddr496, align 4
-  %shl = shl i32 %tmp9, 8
-  %tmp10 = load i8*, i8** %tmp8, align 8
-  %tmp11 = load i8, i8* %tmp10, align 1
-  %conv = zext i8 %tmp11 to i32
-  %or = or i32 %conv, %shl
-  store i32 %or, i32* %sunkaddr496, align 4
-  %add = add nsw i32 %tmp7, 8
-  %sunkaddr497 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr498 = add i64 %sunkaddr497, 36
-  %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32*
-  store i32 %add, i32* %sunkaddr499, align 4
-  %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1
-  store i8* %incdec.ptr, i8** %tmp8, align 8
-  %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr501 = add i64 %sunkaddr500, 8
-  %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32*
-  store i32 %lsr.iv482, i32* %sunkaddr502, align 4
-  %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr504 = add i64 %sunkaddr503, 12
-  %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32*
-  %tmp12 = load i32, i32* %sunkaddr505, align 4
-  %inc = add i32 %tmp12, 1
-  store i32 %inc, i32* %sunkaddr505, align 4
-  %cmp49 = icmp eq i32 %inc, 0
-  br i1 %cmp49, label %if.then.51, label %while.body.backedge
-
-if.then.51:                                       ; preds = %if.end.37
-  %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64
-  %sunkaddr507 = add i64 %sunkaddr506, 16
-  %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32*
-  %tmp13 = load i32, i32* %sunkaddr508, align 4
-  %inc53 = add i32 %tmp13, 1
-  store i32 %inc53, i32* %sunkaddr508, align 4
-  br label %while.body.backedge
-
-while.body.backedge:                              ; preds = %if.then.51, %if.end.37
-  %lsr.iv.next483 = add i32 %lsr.iv482, -1
-  %cmp28 = icmp sgt i32 %add, 7
-  br i1 %cmp28, label %if.then.29, label %if.end.33
-
-if.end.62:                                        ; preds = %if.then.29
-  %sub64 = add nsw i32 %and, -48
-  %sunkaddr509 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr510 = add i64 %sunkaddr509, 40
-  %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32*
-  store i32 %sub64, i32* %sunkaddr511, align 4
-  br label %sw.bb.65
-
-sw.bb.65:                                         ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge
-  %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ]
-  %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ]
-  %sunkaddr512 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr513 = add i64 %sunkaddr512, 8
-  %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32*
-  store i32 14, i32* %sunkaddr514, align 4
-  %cmp70.397 = icmp sgt i32 %tmp14, 7
-  br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph
-
-if.end.82.lr.ph:                                  ; preds = %sw.bb.65
-  %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8
-  %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1
-  %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4
-  %tmp16 = add i32 %.pre431, -1
-  br label %if.end.82
-
-if.then.72:                                       ; preds = %while.body.68.backedge, %sw.bb.65
-  %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ]
-  %sub76 = add nsw i32 %.lcssa390, -8
-  %sunkaddr516 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr517 = add i64 %sunkaddr516, 36
-  %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32*
-  store i32 %sub76, i32* %sunkaddr518, align 4
-  %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11
-  %tmp17 = load i32, i32* %currBlockNo, align 4
-  %inc117 = add nsw i32 %tmp17, 1
-  store i32 %inc117, i32* %currBlockNo, align 4
-  %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12
-  %tmp18 = load i32, i32* %verbosity, align 4
-  %cmp118 = icmp sgt i32 %tmp18, 1
-  br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0
-
-if.end.82:                                        ; preds = %while.body.68.backedge, %if.end.82.lr.ph
-  %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ]
-  %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ]
-  %cmp85 = icmp eq i32 %lsr.iv480, -1
-  br i1 %cmp85, label %save_state_and_return, label %if.end.88
-
-if.end.88:                                        ; preds = %if.end.82
-  %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8**
-  %sunkaddr519 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr520 = add i64 %sunkaddr519, 32
-  %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32*
-  %tmp21 = load i32, i32* %sunkaddr521, align 4
-  %shl90 = shl i32 %tmp21, 8
-  %tmp22 = load i8*, i8** %tmp20, align 8
-  %tmp23 = load i8, i8* %tmp22, align 1
-  %conv93 = zext i8 %tmp23 to i32
-  %or94 = or i32 %conv93, %shl90
-  store i32 %or94, i32* %sunkaddr521, align 4
-  %add97 = add nsw i32 %tmp19, 8
-  %sunkaddr522 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr523 = add i64 %sunkaddr522, 36
-  %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32*
-  store i32 %add97, i32* %sunkaddr524, align 4
-  %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1
-  store i8* %incdec.ptr100, i8** %tmp20, align 8
-  %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr526 = add i64 %sunkaddr525, 8
-  %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32*
-  store i32 %lsr.iv480, i32* %sunkaddr527, align 4
-  %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr529 = add i64 %sunkaddr528, 12
-  %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32*
-  %tmp24 = load i32, i32* %sunkaddr530, align 4
-  %inc106 = add i32 %tmp24, 1
-  store i32 %inc106, i32* %sunkaddr530, align 4
-  %cmp109 = icmp eq i32 %inc106, 0
-  br i1 %cmp109, label %if.then.111, label %while.body.68.backedge
-
-if.then.111:                                      ; preds = %if.end.88
-  %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64
-  %sunkaddr532 = add i64 %sunkaddr531, 16
-  %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32*
-  %tmp25 = load i32, i32* %sunkaddr533, align 4
-  %inc114 = add i32 %tmp25, 1
-  store i32 %inc114, i32* %sunkaddr533, align 4
-  br label %while.body.68.backedge
-
-while.body.68.backedge:                           ; preds = %if.then.111, %if.end.88
-  %lsr.iv.next481 = add i32 %lsr.iv480, -1
-  %cmp70 = icmp sgt i32 %add97, 7
-  br i1 %cmp70, label %if.then.72, label %if.end.82
-
-if.then.120:                                      ; preds = %if.then.72
-  %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117)
-  br label %sw.bb.123
-
-sw.bb.123:                                        ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge
-  %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ]
-  %sunkaddr534 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr535 = add i64 %sunkaddr534, 8
-  %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32*
-  store i32 25, i32* %sunkaddr536, align 4
-  %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4
-  %cmp128.395 = icmp sgt i32 %tmp26, 7
-  br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph
-
-sw.bb.123.if.then.130_crit_edge:                  ; preds = %sw.bb.123
-  %sunkaddr537 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr538 = add i64 %sunkaddr537, 32
-  %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32*
-  %.pre429 = load i32, i32* %sunkaddr539, align 4
-  br label %if.then.130
-
-if.end.140.lr.ph:                                 ; preds = %sw.bb.123
-  %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream**
-  %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8
-  %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1
-  %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4
-  %tmp28 = add i32 %.pre432, -1
-  br label %if.end.140
-
-if.then.130:                                      ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge
-  %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ]
-  %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ]
-  %sub134 = add nsw i32 %.lcssa, -8
-  %shr135 = lshr i32 %tmp29, %sub134
-  store i32 %sub134, i32* %bsLive127.pre-phi, align 4
-  %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13
-  %tmp30 = load i32, i32* %origPtr, align 4
-  %shl175 = shl i32 %tmp30, 8
-  %conv176 = and i32 %shr135, 255
-  %or177 = or i32 %shl175, %conv176
-  store i32 %or177, i32* %origPtr, align 4
-  %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27
-  %tmp31 = load i32, i32* %nInUse, align 4
-  %add179 = add nsw i32 %tmp31, 2
-  br label %save_state_and_return
-
-if.end.140:                                       ; preds = %while.body.126.backedge, %if.end.140.lr.ph
-  %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ]
-  %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ]
-  %cmp143 = icmp eq i32 %lsr.iv, -1
-  br i1 %cmp143, label %save_state_and_return, label %if.end.146
-
-if.end.146:                                       ; preds = %if.end.140
-  %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8**
-  %sunkaddr541 = ptrtoint %struct.DState* %s to i64
-  %sunkaddr542 = add i64 %sunkaddr541, 32
-  %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32*
-  %tmp34 = load i32, i32* %sunkaddr543, align 4
-  %shl148 = shl i32 %tmp34, 8
-  %tmp35 = load i8*, i8** %tmp33, align 8
-  %tmp36 = load i8, i8* %tmp35, align 1
-  %conv151 = zext i8 %tmp36 to i32
-  %or152 = or i32 %conv151, %shl148
-  store i32 %or152, i32* %sunkaddr543, align 4
-  %add155 = add nsw i32 %tmp32, 8
-  store i32 %add155, i32* %bsLive127.pre-phi, align 4
-  %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1
-  store i8* %incdec.ptr158, i8** %tmp33, align 8
-  %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr545 = add i64 %sunkaddr544, 8
-  %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32*
-  store i32 %lsr.iv, i32* %sunkaddr546, align 4
-  %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr548 = add i64 %sunkaddr547, 12
-  %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32*
-  %tmp37 = load i32, i32* %sunkaddr549, align 4
-  %inc164 = add i32 %tmp37, 1
-  store i32 %inc164, i32* %sunkaddr549, align 4
-  %cmp167 = icmp eq i32 %inc164, 0
-  br i1 %cmp167, label %if.then.169, label %while.body.126.backedge
-
-if.then.169:                                      ; preds = %if.end.146
-  %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64
-  %sunkaddr551 = add i64 %sunkaddr550, 16
-  %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32*
-  %tmp38 = load i32, i32* %sunkaddr552, align 4
-  %inc172 = add i32 %tmp38, 1
-  store i32 %inc172, i32* %sunkaddr552, align 4
-  br label %while.body.126.backedge
-
-while.body.126.backedge:                          ; preds = %if.then.169, %if.end.146
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %cmp128 = icmp sgt i32 %add155, 7
-  br i1 %cmp128, label %if.then.130, label %if.end.140
-
-sw.default:                                       ; preds = %if.end, %if.end.thread
-  %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ]
-  %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ]
-  %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ]
-  %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ]
-  %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ]
-  %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ]
-  %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ]
-  %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ]
-  %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ]
-  %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ]
-  %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ]
-  %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ]
-  %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ]
-  %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ]
-  %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ]
-  %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ]
-  %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ]
-  %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ]
-  %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ]
-  %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ]
-  %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ]
-  %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ]
-  %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ]
-  %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ]
-  %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ]
-  %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ]
-  %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ]
-  %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ]
-  %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ]
-  %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ]
-  %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ]
-  %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ]
-  %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ]
-  %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ]
-  %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ]
-  %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ]
-  %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ]
-  tail call void @bar(i32 4001)
-  br label %save_state_and_return
-
-save_state_and_return:                            ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29
-  %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ]
-  %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ]
-  %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ]
-  %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ]
-  %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ]
-  %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ]
-  %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ]
-  %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ]
-  %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ]
-  %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ]
-  %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ]
-  %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ]
-  %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ]
-  %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ]
-  %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ]
-  %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ]
-  %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ]
-  %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ]
-  %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ]
-  %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ]
-  %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ]
-  %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ]
-  %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ]
-  %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ]
-  %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ]
-  %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ]
-  %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ]
-  %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ]
-  %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ]
-  %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ]
-  %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ]
-  %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ]
-  %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ]
-  %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ]
-  %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ]
-  %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ]
-  %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ]
-  %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ]
-  store i32 %tmp58, i32* %save_i, align 4
-  store i32 %tmp59, i32* %save_j3.pre-phi468, align 4
-  store i32 %tmp60, i32* %save_t4.pre-phi466, align 4
-  store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4
-  store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4
-  store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4
-  store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4
-  store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4
-  store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4
-  store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4
-  store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4
-  store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4
-  store i32 %tmp68, i32* %save_es14.pre-phi446, align 4
-  store i32 %tmp69, i32* %save_N15.pre-phi444, align 4
-  store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4
-  store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4
-  store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4
-  store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4
-  store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4
-  ret i32 %retVal.0
-}
-
-!0 = !{!"branch_weights", i32 10, i32 1}
Index: test/CodeGen/ARM/subreg-remat.ll
===================================================================
--- test/CodeGen/ARM/subreg-remat.ll
+++ test/CodeGen/ARM/subreg-remat.ll
@@ -11,10 +11,10 @@
 ; since it implicitly reads the ssub_1 sub-register.
 ;
 ; CHECK: f1
-; CHECK: vmov    d0, r0, r0
-; CHECK: vldr s1, LCPI
+; CHECK: vmov    d1, r0, r0
+; CHECK: vldr s3, LCPI
 ; The vector must be spilled:
-; CHECK: vstr d0,
+; CHECK: vstr d1,
 ; CHECK: asm clobber d0
 ; And reloaded after the asm:
 ; CHECK: vldr [[D16:d[0-9]+]],
Index: test/CodeGen/SPARC/spill.ll
===================================================================
--- test/CodeGen/SPARC/spill.ll
+++ test/CodeGen/SPARC/spill.ll
@@ -7,8 +7,9 @@
 ;; registers to ensure the spill will happen.
 
 ; CHECK-LABEL: test_i32_spill:
-; CHECK:       and %i0, %i1, %o0
-; CHECK:       st %o0, [%fp+{{.+}}]
+; CHECK:       and %i0, %i1, %i0
+; CHECK:       mov %i0, %o0
+; CHECK:       st %i0, [%fp+{{.+}}]
 ; CHECK:       add %o0, %o0, %g0
 ; CHECK:       ld [%fp+{{.+}}, %i0
 define i32 @test_i32_spill(i32 %a, i32 %b) {
@@ -20,9 +21,11 @@
 }
 
 ; CHECK-LABEL: test_i64_spill:
-; CHECK:       and %i0, %i2, %o0
-; CHECK:       and %i1, %i3, %o1
-; CHECK:       std %o0, [%fp+{{.+}}]
+; CHECK:       and %i0, %i2, %i4
+; CHECK:       and %i1, %i3, %i5
+; CHECK:       mov %i4, %o0
+; CHECK:       mov %i5, %o1
+; CHECK:       std %i4, [%fp+{{.+}}]
 ; CHECK:       add %o0, %o0, %g0
 ; CHECK:       ldd [%fp+{{.+}}, %i0
 define i64 @test_i64_spill(i64 %a, i64 %b) {
Index: test/CodeGen/X86/avx512-bugfix-25270.ll
===================================================================
--- test/CodeGen/X86/avx512-bugfix-25270.ll
+++ test/CodeGen/X86/avx512-bugfix-25270.ll
@@ -10,9 +10,9 @@
 ; CHECK-NEXT:    subq $112, %rsp
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    vmovdqu32 (%rbx), %zmm0
-; CHECK-NEXT:    vmovups %zmm0, (%rsp) ## 64-byte Spill
 ; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1
 ; CHECK-NEXT:    vmovdqa32 %zmm1, (%rbx)
+; CHECK-NEXT:    vmovups %zmm0, (%rsp) ## 64-byte Spill
 ; CHECK-NEXT:    callq _Print__512
 ; CHECK-NEXT:    vmovups (%rsp), %zmm0 ## 64-byte Reload
 ; CHECK-NEXT:    callq _Print__512
Index: test/CodeGen/X86/fold-push.ll
===================================================================
--- test/CodeGen/X86/fold-push.ll
+++ test/CodeGen/X86/fold-push.ll
@@ -5,8 +5,9 @@
 
 define void @test(i32 %a, i32 %b) optsize nounwind {
 ; CHECK-LABEL: test:
-; CHECK: movl [[EAX:%e..]], (%esp)
-; CHECK-NEXT: pushl [[EAX]]
+; CHECK: addl
+; CHECK-NEXT: pushl [[EAX:%e..]]
+; CHECK-NEXT: movl [[EAX]], 4(%esp)
 ; CHECK-NEXT: calll
 ; CHECK-NEXT: addl $4, %esp
 ; CHECK: nop
@@ -24,8 +25,9 @@
 
 define void @test_min(i32 %a, i32 %b) minsize nounwind {
 ; CHECK-LABEL: test_min:
-; CHECK: movl [[EAX:%e..]], (%esp)
-; CHECK-NEXT: pushl [[EAX]]
+; CHECK: addl
+; CHECK-NEXT: pushl [[EAX:%e..]]
+; CHECK-NEXT: movl [[EAX]], 4(%esp)
 ; CHECK-NEXT: calll
 ; CHECK-NEXT: popl
 ; CHECK: nop
Index: test/CodeGen/X86/hoist-spill.ll
===================================================================
--- test/CodeGen/X86/hoist-spill.ll
+++ test/CodeGen/X86/hoist-spill.ll
@@ -0,0 +1,115 @@
+; RUN: llc < %s | grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/); exit -1}'
+; Check that no two spills write to the same stack slot after hoisting.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32*, align 8
+@b = external global i32, align 4
+@d = external global i32*, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @fn1(i32 %p1) #0 {
+entry:
+  %tmp = load i32*, i32** @d, align 8
+  %tmp1 = load i32*, i32** @a, align 8
+  %tmp2 = sext i32 %p1 to i64
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc14, %entry
+  %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ]
+  %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ]
+  %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
+  %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
+  %tmp3 = icmp sgt i32 undef, 0
+  %smax52 = select i1 %tmp3, i32 undef, i32 0
+  %tmp4 = zext i32 %smax52 to i64
+  %tmp5 = icmp sgt i64 undef, %tmp4
+  %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+  %tmp6 = add nsw i64 %smax53, 1
+  %tmp7 = sub nsw i64 %tmp6, %tmp4
+  %tmp8 = add nsw i64 %tmp7, -8
+  %tmp9 = sub i32 undef, %indvar
+  %tmp10 = icmp sgt i64 %tmp2, 0
+  %smax40 = select i1 %tmp10, i64 %tmp2, i64 0
+  %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40
+  %indvars.iv30 = add i32 %indvars.iv30.in, -1
+  %tmp11 = icmp sgt i32 %indvars.iv30, 0
+  %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0
+  %tmp12 = zext i32 %smax to i64
+  %sub = sub nsw i32 %p1, %c.0
+  %cmp = icmp sgt i32 %sub, 0
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  %cmp326 = icmp sgt i32 %k.0, %p1
+  br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.cond
+  br label %for.body
+
+for.cond4.preheader:                              ; preds = %for.body, %for.cond
+  %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ]
+  %cmp528 = icmp sgt i32 %sub., %p1
+  br i1 %cmp528, label %for.inc14, label %for.body6.preheader
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br i1 undef, label %for.body6, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %for.body6.preheader
+  br i1 undef, label %for.body6, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %bound1 = icmp ule i32* undef, %scevgep41
+  %memcheck.conflict = and i1 undef, %bound1
+  br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %lcmp.mod = icmp eq i64 undef, 0
+  br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.prol:                                 ; preds = %vector.body.prol, %vector.body.preheader
+  %prol.iter.cmp = icmp eq i64 undef, 0
+  br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.preheader.split:                      ; preds = %vector.body.prol, %vector.body.preheader
+  %tmp13 = icmp ult i64 %tmp8, 24
+  br i1 %tmp13, label %middle.block, label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.body.preheader.split
+  %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ]
+  %index.next = add i64 %index, 8
+  %offset.idx.1 = add i64 %tmp12, %index.next
+  %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1
+  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4
+  %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1
+  %tmp17 = bitcast i32* %tmp16 to <4 x i32>*
+  store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4
+  %index.next.3 = add i64 %index, 32
+  br i1 undef, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body, %vector.body.preheader.split
+  br i1 undef, label %for.inc14, label %for.body6
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ]
+  %add = add nsw i32 %k.127, 1
+  %tmp18 = load i32, i32* undef, align 4
+  store i32 %tmp18, i32* @b, align 4
+  br i1 undef, label %for.body, label %for.cond4.preheader
+
+for.body6:                                        ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
+  %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
+  %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32
+  %tmp19 = load i32, i32* %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32
+  store i32 %tmp19, i32* %arrayidx10, align 4
+  %cmp5 = icmp slt i64 %indvars.iv32, undef
+  br i1 %cmp5, label %for.body6, label %for.inc14
+
+for.inc14:                                        ; preds = %for.body6, %middle.block, %for.cond4.preheader
+  %inc15 = add nuw nsw i32 %c.0, 1
+  %indvar.next = add i32 %indvar, 1
+  br label %for.cond
+}
+
+attributes #0 = { norecurse noreturn nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: test/CodeGen/X86/new-remat.ll
===================================================================
--- test/CodeGen/X86/new-remat.ll
+++ test/CodeGen/X86/new-remat.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s | FileCheck %s
+; Check that all spills are rematerialized, so no spill remains in the output.
+; CHECK-NOT: Spill
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global double 0.000000e+00, align 8
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @uniform_testdata(i32 %p1) #0 {
+entry:
+  %cmp3 = icmp sgt i32 %p1, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %0 = add i32 %p1, -1
+  %xtraiter = and i32 %p1, 7
+  %lcmp.mod = icmp eq i32 %xtraiter, 0
+  br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader
+
+for.body.prol.preheader:                          ; preds = %for.body.preheader
+  br label %for.body.prol
+
+for.body.prol:                                    ; preds = %for.body.prol.preheader, %for.body.prol
+  %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ]
+  %1 = load double, double* @b, align 8
+  %call.prol = tail call double @pow(double %1, double 2.500000e-01) #2
+  %inc.prol = add nuw nsw i32 %i.04.prol, 1
+  %prol.iter.sub = add i32 %prol.iter, -1
+  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+  br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol
+
+for.body.preheader.split.loopexit:                ; preds = %for.body.prol
+  %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ]
+  br label %for.body.preheader.split
+
+for.body.preheader.split:                         ; preds = %for.body.preheader.split.loopexit, %for.body.preheader
+  %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ]
+  %2 = icmp ult i32 %0, 7
+  br i1 %2, label %for.end.loopexit, label %for.body.preheader.split.split
+
+for.body.preheader.split.split:                   ; preds = %for.body.preheader.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.split.split
+  %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ]
+  %3 = load double, double* @b, align 8
+  %call = tail call double @pow(double %3, double 2.500000e-01) #2
+  %4 = load double, double* @b, align 8
+  %call.1 = tail call double @pow(double %4, double 2.500000e-01) #2
+  %inc.7 = add nsw i32 %i.04, 8
+  %exitcond.7 = icmp eq i32 %inc.7, %p1
+  br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body
+
+for.end.loopexit.unr-lcssa:                       ; preds = %for.body
+  br label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body.preheader.split, %for.end.loopexit.unr-lcssa
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %5 = load i32, i32* @a, align 4
+  ret i32 %5
+}
+
+; Function Attrs: nounwind
+declare double @pow(double, double) #1
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
Index: test/CodeGen/X86/ragreedy-hoist-spill.ll
===================================================================
--- test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
 
 ; This testing case is reduced from 254.gap SyFgets funciton.
-; We make sure a spill is not hoisted to a hotter outer loop.
+; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
 
 %struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
 %struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
@@ -181,6 +181,10 @@
   br i1 %cmp476, label %if.end517, label %do.body479.preheader
 
 do.body479.preheader:
+  ; CHECK: do.body479.preheader
+  ; The spill is hoisted here: although the depth-1 loop is even hotter than the depth-2 loop, do.body479.preheader itself is cold.
+  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; CHECK: land.rhs485
   %cmp4833314 = icmp eq i8 undef, 0
   br i1 %cmp4833314, label %if.end517, label %land.rhs485
 
@@ -200,8 +204,8 @@
 
 lor.rhs500:
   ; CHECK: lor.rhs500
-  ; Make sure that we don't hoist the spill to outer loops.
-  ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+  ; Make sure the spill is hoisted to a cold preheader in the outer loop.
+  ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp)
   ; CHECK: callq {{.*}}maskrune
   %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
   br i1 undef, label %land.lhs.true504, label %do.body479.backedge
Index: test/CodeGen/X86/vselect-minmax.ll
===================================================================
--- test/CodeGen/X86/vselect-minmax.ll
+++ test/CodeGen/X86/vselect-minmax.ll
@@ -4888,13 +4888,14 @@
 define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
 ; SSE2-LABEL: test122:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, %xmm11
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
+; SSE2-NEXT:    movdqa %xmm11, %xmm8
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
@@ -5163,7 +5164,6 @@
 ; SSE2-LABEL: test124:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movdqa %xmm7, %xmm11
-; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -5172,6 +5172,7 @@
 ; SSE2-NEXT:    movdqa %xmm7, %xmm8
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm11, %xmm0
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm11
@@ -5465,13 +5466,14 @@
 define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
 ; SSE2-LABEL: test126:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, %xmm11
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm11, %xmm8
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
@@ -5794,7 +5796,6 @@
 ; SSE2-LABEL: test128:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movdqa %xmm7, %xmm11
-; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -5803,6 +5804,7 @@
 ; SSE2-NEXT:    movdqa %xmm7, %xmm8
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm11, %xmm0
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm11
@@ -7608,13 +7610,14 @@
 define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
 ; SSE2-LABEL: test154:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, %xmm11
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,0,2147483648,0]
+; SSE2-NEXT:    movdqa %xmm11, %xmm8
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
@@ -7881,7 +7884,6 @@
 ; SSE2-LABEL: test156:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movdqa %xmm7, %xmm11
-; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -7890,6 +7892,7 @@
 ; SSE2-NEXT:    movdqa %xmm7, %xmm8
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm11, %xmm0
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm11
@@ -8181,13 +8184,14 @@
 define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
 ; SSE2-LABEL: test158:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, %xmm11
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm11, %xmm8
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
@@ -8508,7 +8512,6 @@
 ; SSE2-LABEL: test160:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movdqa %xmm7, %xmm11
-; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -8517,6 +8520,7 @@
 ; SSE2-NEXT:    movdqa %xmm7, %xmm8
 ; SSE2-NEXT:    pxor %xmm10, %xmm8
 ; SSE2-NEXT:    movdqa %xmm11, %xmm0
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm10, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm11
 ; SSE2-NEXT:    pcmpgtd %xmm8, %xmm11
Index: test/CodeGen/X86/win-catchpad.ll
===================================================================
--- test/CodeGen/X86/win-catchpad.ll
+++ test/CodeGen/X86/win-catchpad.ll
@@ -92,9 +92,9 @@
 ; X86: subl $8, %esp
 ; X86: addl $12, %ebp
 ; X86: movl %esp, -[[sp_offset]](%ebp)
-; X86: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
 ; X86: movl -32(%ebp), %[[e_reg:[a-z]+]]
 ; X86: movl $1, -{{[0-9]+}}(%ebp)
+; X86: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
 ; X86-DAG: movl %[[addr_reg]], 4(%esp)
 ; X86-DAG: movl %[[e_reg]], (%esp)
 ; X86: calll _f
@@ -109,8 +109,8 @@
 ; X86: subl $8, %esp
 ; X86: addl $12, %ebp
 ; X86: movl %esp, -[[sp_offset]](%ebp)
-; X86: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
 ; X86: movl $1, -{{[0-9]+}}(%ebp)
+; X86: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
 ; X86-DAG: movl %[[addr_reg]], 4(%esp)
 ; X86-DAG: movl $3, (%esp)
 ; X86: calll _f