Index: include/llvm/CodeGen/MachinePipeliner.h
===================================================================
--- include/llvm/CodeGen/MachinePipeliner.h
+++ include/llvm/CodeGen/MachinePipeliner.h
@@ -40,11 +40,13 @@
 #ifndef LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
 #define LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
 
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
-
+#include <initializer_list>
 namespace llvm {
 
 class NodeSet;
@@ -61,6 +63,7 @@
   const MachineDominatorTree *MDT = nullptr;
   const InstrItineraryData *InstrItins;
   const TargetInstrInfo *TII = nullptr;
+  const MachineBranchProbabilityInfo *MBPI = nullptr;
   RegisterClassInfo RegClassInfo;
   bool disabledByPragma = false;
   unsigned II_setByPragma = 0;
@@ -93,6 +96,7 @@
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachineDominatorTree>();
     AU.addRequired<LiveIntervals>();
+    AU.addRequired<MachineBranchProbabilityInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -203,6 +207,8 @@
       Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
   }
 
+  MachinePipeliner &getPass() { return Pass; }
+
   void schedule() override;
   void finishBlock() override;
 
@@ -658,6 +664,146 @@
   void dump() const;
 };
 
+//===----------------------------------------------------------------------===//
+// CFG generation support
+//===----------------------------------------------------------------------===//
+
+// A CFG wraps CFGBlocks and Stages.
+class CFG;
+// A CFGBlock represents one basic block in the generated CFG. It contains
+// Stages.
+class CFGBlock;
+
+// A Stage lives inside a CFGBlock and contains a clone of one stage of the
+// schedule.
+class Stage {
+public:
+  // Create a Stage within the given CFGBlock. The stage will be fed the given
+  // input stages, which must be either zero, one or two in size. A placeholder
+  // nullptr can be given for the last stage if it is not yet defined.
+  Stage(CFGBlock *Block, std::initializer_list<Stage *> Inputs,
+        unsigned StageIdx);
+
+  // When nullptr was given as the last input in the constructor, this method
+  // sets the last input.
+  void setLastInput(Stage *Input) {
+    assert(!Inputs.back());
+    Inputs.back() = Input;
+  }
+
+  // Perform remapping of the rewritten instructions. All inputs must be
+  // non-nullptr at this time.
+  void finalize();
+
+  // Remap all  registers used outside the original loop.
+  void rewriteUsesOutsideOfLoop();
+
+  const DenseMap<unsigned, unsigned> &remaps() { return PostRemaps; }
+  MachineBasicBlock *getMBB() { return MBB; }
+  CFG &getContext() { return Ctx; }
+  ArrayRef<MachineInstr *> getInstrs() const { return Instrs; }
+
+private:
+  CFG &Ctx;
+  CFGBlock *Block;
+  // The block we will write our instructions into.
+  MachineBasicBlock *MBB;
+  // The index of this stage.
+  unsigned StageIdx;
+  // The inputs to this stage. The last input (if Inputs.size() == 2) may be
+  // nullptr until finalize() is called.
+  SmallVector<Stage *, 2> Inputs;
+  // Our generated PHI instructions for collecting incoming values.
+  SmallVector<MachineInstr *, 4> Phis;
+  // Our generated instructions.
+  SmallVector<MachineInstr *, 4> Instrs;
+  // Map of original register to rewritten register on entry and including
+  // defs within this stage.
+  DenseMap<unsigned, unsigned> Remaps;
+  // Remaps, but with loop-carried PHI values updated.
+  DenseMap<unsigned, unsigned> PostRemaps;
+  // Map from generated PHI to its pre-rewrite register.
+  DenseMap<MachineInstr *, unsigned> PhiRegs;
+};
+
+// A CFGBlock manages the layout and rewriting of loop instructions. A
+// CFGBlock creates possibly multiple copies of the non-terminator,
+// non-phi instructions in OrigMBB. It handles PHI insertion and rewriting
+// instructions to use previously-computed values.
+class CFGBlock {
+public:
+  CFGBlock(CFG &Ctx);
+
+  // Creates a new Stage within this block.
+  Stage *CreateStage(std::initializer_list<Stage *> Inputs, unsigned StageIdx);
+
+  // Rewrite the instructions in this block to account for feeds from
+  // predecessors. This must be the final action on this CFGBlock. Returns the
+  // new block.
+  MachineBasicBlock *finalize();
+
+  // Add a single successor to this block. This updates the predecessors list
+  // of Succ.
+  void addSuccessor(CFGBlock &Succ);
+  // Add multiple successors to this block. This updates the predecessors list
+  // of TrueSucc and FalseSucc.
+  void addSuccessors(CFGBlock &TrueSucc, CFGBlock &FalseSucc);
+
+  // Set the branch condition (for a multiple-successor block) to Cond.
+  void setBranchCondition(SmallVector<MachineOperand, 4> Cond) {
+    BranchCondition = std::move(Cond);
+  }
+
+  // Reorder the instructions in this block to be according to the schedule
+  // order.
+  void sortAll();
+
+  CFG &getContext() { return Ctx; }
+  MachineBasicBlock *getMBB() { return MBB; }
+  ArrayRef<CFGBlock *> successors() const { return Successors; }
+  ArrayRef<CFGBlock *> predecessors() const { return Predecessors; }
+
+private:
+  CFG &Ctx;
+
+  std::list<Stage> Stages;
+  SmallVector<CFGBlock *, 2> Successors;
+  SmallVector<CFGBlock *, 2> Predecessors;
+
+  // The generated block.
+  MachineBasicBlock *MBB;
+  SmallVector<MachineOperand, 4> BranchCondition;
+};
+
+// Constructs and maintains state for a series of CFGBlocks. CFG
+// drives the cloning of instructions and creation of prolog / epilog blocks.
+// It does not insert the new region of blocks into the function.
+class CFG {
+public:
+  CFG(MachineLoop *Loop, SMSchedule &Schedule, SwingSchedulerDAG &DAG);
+
+  MachineBasicBlock *getEntryBlock() { return Entry; }
+  MachineBasicBlock *getExitBlock() { return Exit; }
+
+private:
+  friend class Stage;
+  friend class CFGBlock;
+
+  MachinePipeliner &Pass;
+  SMSchedule &Schedule;
+  MachineBasicBlock *OrigBB;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo *TII;
+  MachineFunction &MF;
+  SwingSchedulerDAG &DAG;
+
+  // Map from cloned instruction to its original in OrigBB.
+  DenseMap<MachineInstr *, MachineInstr *> Rewrites;
+
+  MachineBasicBlock *Entry, *Exit;
+};
+
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_CODEGEN_MACHINEPIPELINER_H
Index: lib/CodeGen/MachinePipeliner.cpp
===================================================================
--- lib/CodeGen/MachinePipeliner.cpp
+++ lib/CodeGen/MachinePipeliner.cpp
@@ -29,6 +29,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/MachinePipeliner.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -54,7 +55,6 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachinePipeliner.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
@@ -206,6 +206,7 @@
   MLI = &getAnalysis<MachineLoopInfo>();
   MDT = &getAnalysis<MachineDominatorTree>();
   TII = MF->getSubtarget().getInstrInfo();
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   RegClassInfo.runOnMachineFunction(*MF);
 
   for (auto &L : *MLI)
@@ -515,7 +516,22 @@
     return;
   }
 
-  generatePipelinedLoop(Schedule);
+  CFG CFG(&Loop, Schedule, *this);
+  MachineBasicBlock *OrigPreheader = Loop.getLoopPreheader();
+  MachineBasicBlock *OrigLoopBlock = Loop.getTopBlock();
+  MachineBasicBlock *OrigExitBlock = Loop.getExitBlock();
+  OrigPreheader->removeSuccessor(Loop.getTopBlock());
+  OrigPreheader->addSuccessor(CFG.getEntryBlock());
+
+  if (TII->removeBranch(*OrigPreheader) > 0)
+    TII->insertUnconditionalBranch(*OrigPreheader, CFG.getEntryBlock(),
+                                   DebugLoc());
+
+  CFG.getExitBlock()->addSuccessor(OrigExitBlock);
+  TII->insertUnconditionalBranch(*CFG.getExitBlock(), OrigExitBlock,
+                                 DebugLoc());
+  OrigLoopBlock->removeSuccessor(OrigExitBlock);
+  OrigLoopBlock->eraseFromParent();
   ++NumPipelined;
 }
 
@@ -1989,7 +2005,6 @@
         if (SwpMaxStages > -1 &&
             Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
           scheduleFound = false;
-
       LLVM_DEBUG({
         if (!scheduleFound)
           dbgs() << "\tCan't schedule\n";
@@ -2012,836 +2027,6 @@
   return scheduleFound && Schedule.getMaxStageCount() > 0;
 }
 
-/// Given a schedule for the loop, generate a new version of the loop,
-/// and replace the old version.  This function generates a prolog
-/// that contains the initial iterations in the pipeline, and kernel
-/// loop, and the epilogue that contains the code for the final
-/// iterations.
-void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
-  // Create a new basic block for the kernel and add it to the CFG.
-  MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
-
-  unsigned MaxStageCount = Schedule.getMaxStageCount();
-
-  // Remember the registers that are used in different stages. The index is
-  // the iteration, or stage, that the instruction is scheduled in.  This is
-  // a map between register names in the original block and the names created
-  // in each stage of the pipelined loop.
-  ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
-  InstrMapTy InstrMap;
-
-  SmallVector<MachineBasicBlock *, 4> PrologBBs;
-
-  MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
-  assert(PreheaderBB != nullptr &&
-         "Need to add code to handle loops w/o preheader");
-  // Generate the prolog instructions that set up the pipeline.
-  generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
-  MF.insert(BB->getIterator(), KernelBB);
-
-  // Rearrange the instructions to generate the new, pipelined loop,
-  // and update register names as needed.
-  for (int Cycle = Schedule.getFirstCycle(),
-           LastCycle = Schedule.getFinalCycle();
-       Cycle <= LastCycle; ++Cycle) {
-    std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
-    // This inner loop schedules each instruction in the cycle.
-    for (SUnit *CI : CycleInstrs) {
-      if (CI->getInstr()->isPHI())
-        continue;
-      unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
-      MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
-      updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
-      KernelBB->push_back(NewMI);
-      InstrMap[NewMI] = CI->getInstr();
-    }
-  }
-
-  // Copy any terminator instructions to the new kernel, and update
-  // names as needed.
-  for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
-                                   E = BB->instr_end();
-       I != E; ++I) {
-    MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
-    updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
-    KernelBB->push_back(NewMI);
-    InstrMap[NewMI] = &*I;
-  }
-
-  KernelBB->transferSuccessors(BB);
-  KernelBB->replaceSuccessor(BB, KernelBB);
-
-  generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
-                       VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
-  generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
-               InstrMap, MaxStageCount, MaxStageCount, false);
-
-  LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
-
-  SmallVector<MachineBasicBlock *, 4> EpilogBBs;
-  // Generate the epilog instructions to complete the pipeline.
-  generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
-                 PrologBBs);
-
-  // We need this step because the register allocation doesn't handle some
-  // situations well, so we insert copies to help out.
-  splitLifetimes(KernelBB, EpilogBBs, Schedule);
-
-  // Remove dead instructions due to loop induction variables.
-  removeDeadInstructions(KernelBB, EpilogBBs);
-
-  // Add branches between prolog and epilog blocks.
-  addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
-
-  // Remove the original loop since it's no longer referenced.
-  for (auto &I : *BB)
-    LIS.RemoveMachineInstrFromMaps(I);
-  BB->clear();
-  BB->eraseFromParent();
-
-  delete[] VRMap;
-}
-
-/// Generate the pipeline prolog code.
-void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
-                                       MachineBasicBlock *KernelBB,
-                                       ValueMapTy *VRMap,
-                                       MBBVectorTy &PrologBBs) {
-  MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
-  assert(PreheaderBB != nullptr &&
-         "Need to add code to handle loops w/o preheader");
-  MachineBasicBlock *PredBB = PreheaderBB;
-  InstrMapTy InstrMap;
-
-  // Generate a basic block for each stage, not including the last stage,
-  // which will be generated in the kernel. Each basic block may contain
-  // instructions from multiple stages/iterations.
-  for (unsigned i = 0; i < LastStage; ++i) {
-    // Create and insert the prolog basic block prior to the original loop
-    // basic block.  The original loop is removed later.
-    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
-    PrologBBs.push_back(NewBB);
-    MF.insert(BB->getIterator(), NewBB);
-    NewBB->transferSuccessors(PredBB);
-    PredBB->addSuccessor(NewBB);
-    PredBB = NewBB;
-
-    // Generate instructions for each appropriate stage. Process instructions
-    // in original program order.
-    for (int StageNum = i; StageNum >= 0; --StageNum) {
-      for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
-                                       BBE = BB->getFirstTerminator();
-           BBI != BBE; ++BBI) {
-        if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
-          if (BBI->isPHI())
-            continue;
-          MachineInstr *NewMI =
-              cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
-          updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
-                            VRMap);
-          NewBB->push_back(NewMI);
-          InstrMap[NewMI] = &*BBI;
-        }
-      }
-    }
-    rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
-    LLVM_DEBUG({
-      dbgs() << "prolog:\n";
-      NewBB->dump();
-    });
-  }
-
-  PredBB->replaceSuccessor(BB, KernelBB);
-
-  // Check if we need to remove the branch from the preheader to the original
-  // loop, and replace it with a branch to the new loop.
-  unsigned numBranches = TII->removeBranch(*PreheaderBB);
-  if (numBranches) {
-    SmallVector<MachineOperand, 0> Cond;
-    TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
-  }
-}
-
-/// Generate the pipeline epilog code. The epilog code finishes the iterations
-/// that were started in either the prolog or the kernel.  We create a basic
-/// block for each stage that needs to complete.
-void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
-                                       MachineBasicBlock *KernelBB,
-                                       ValueMapTy *VRMap,
-                                       MBBVectorTy &EpilogBBs,
-                                       MBBVectorTy &PrologBBs) {
-  // We need to change the branch from the kernel to the first epilog block, so
-  // this call to analyze branch uses the kernel rather than the original BB.
-  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
-  SmallVector<MachineOperand, 4> Cond;
-  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
-  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
-  if (checkBranch)
-    return;
-
-  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
-  if (*LoopExitI == KernelBB)
-    ++LoopExitI;
-  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
-  MachineBasicBlock *LoopExitBB = *LoopExitI;
-
-  MachineBasicBlock *PredBB = KernelBB;
-  MachineBasicBlock *EpilogStart = LoopExitBB;
-  InstrMapTy InstrMap;
-
-  // Generate a basic block for each stage, not including the last stage,
-  // which was generated for the kernel.  Each basic block may contain
-  // instructions from multiple stages/iterations.
-  int EpilogStage = LastStage + 1;
-  for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
-    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
-    EpilogBBs.push_back(NewBB);
-    MF.insert(BB->getIterator(), NewBB);
-
-    PredBB->replaceSuccessor(LoopExitBB, NewBB);
-    NewBB->addSuccessor(LoopExitBB);
-
-    if (EpilogStart == LoopExitBB)
-      EpilogStart = NewBB;
-
-    // Add instructions to the epilog depending on the current block.
-    // Process instructions in original program order.
-    for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
-      for (auto &BBI : *BB) {
-        if (BBI.isPHI())
-          continue;
-        MachineInstr *In = &BBI;
-        if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
-          // Instructions with memoperands in the epilog are updated with
-          // conservative values.
-          MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
-          updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
-          NewBB->push_back(NewMI);
-          InstrMap[NewMI] = In;
-        }
-      }
-    }
-    generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
-                         VRMap, InstrMap, LastStage, EpilogStage, i == 1);
-    generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
-                 InstrMap, LastStage, EpilogStage, i == 1);
-    PredBB = NewBB;
-
-    LLVM_DEBUG({
-      dbgs() << "epilog:\n";
-      NewBB->dump();
-    });
-  }
-
-  // Fix any Phi nodes in the loop exit block.
-  for (MachineInstr &MI : *LoopExitBB) {
-    if (!MI.isPHI())
-      break;
-    for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
-      MachineOperand &MO = MI.getOperand(i);
-      if (MO.getMBB() == BB)
-        MO.setMBB(PredBB);
-    }
-  }
-
-  // Create a branch to the new epilog from the kernel.
-  // Remove the original branch and add a new branch to the epilog.
-  TII->removeBranch(*KernelBB);
-  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
-  // Add a branch to the loop exit.
-  if (EpilogBBs.size() > 0) {
-    MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
-    SmallVector<MachineOperand, 4> Cond1;
-    TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
-  }
-}
-
-/// Replace all uses of FromReg that appear outside the specified
-/// basic block with ToReg.
-static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
-                                    MachineBasicBlock *MBB,
-                                    MachineRegisterInfo &MRI,
-                                    LiveIntervals &LIS) {
-  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
-                                         E = MRI.use_end();
-       I != E;) {
-    MachineOperand &O = *I;
-    ++I;
-    if (O.getParent()->getParent() != MBB)
-      O.setReg(ToReg);
-  }
-  if (!LIS.hasInterval(ToReg))
-    LIS.createEmptyInterval(ToReg);
-}
-
-/// Return true if the register has a use that occurs outside the
-/// specified loop.
-static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
-                            MachineRegisterInfo &MRI) {
-  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
-                                         E = MRI.use_end();
-       I != E; ++I)
-    if (I->getParent()->getParent() != BB)
-      return true;
-  return false;
-}
-
-/// Generate Phis for the specific block in the generated pipelined code.
-/// This function looks at the Phis from the original code to guide the
-/// creation of new Phis.
-void SwingSchedulerDAG::generateExistingPhis(
-    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
-    MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
-    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
-    bool IsLast) {
-  // Compute the stage number for the initial value of the Phi, which
-  // comes from the prolog. The prolog to use depends on to which kernel/
-  // epilog that we're adding the Phi.
-  unsigned PrologStage = 0;
-  unsigned PrevStage = 0;
-  bool InKernel = (LastStageNum == CurStageNum);
-  if (InKernel) {
-    PrologStage = LastStageNum - 1;
-    PrevStage = CurStageNum;
-  } else {
-    PrologStage = LastStageNum - (CurStageNum - LastStageNum);
-    PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
-  }
-
-  for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
-                                   BBE = BB->getFirstNonPHI();
-       BBI != BBE; ++BBI) {
-    unsigned Def = BBI->getOperand(0).getReg();
-
-    unsigned InitVal = 0;
-    unsigned LoopVal = 0;
-    getPhiRegs(*BBI, BB, InitVal, LoopVal);
-
-    unsigned PhiOp1 = 0;
-    // The Phi value from the loop body typically is defined in the loop, but
-    // not always. So, we need to check if the value is defined in the loop.
-    unsigned PhiOp2 = LoopVal;
-    if (VRMap[LastStageNum].count(LoopVal))
-      PhiOp2 = VRMap[LastStageNum][LoopVal];
-
-    int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
-    int LoopValStage =
-        Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
-    unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
-    if (NumStages == 0) {
-      // We don't need to generate a Phi anymore, but we need to rename any uses
-      // of the Phi value.
-      unsigned NewReg = VRMap[PrevStage][LoopVal];
-      rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
-                            Def, InitVal, NewReg);
-      if (VRMap[CurStageNum].count(LoopVal))
-        VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
-    }
-    // Adjust the number of Phis needed depending on the number of prologs left,
-    // and the distance from where the Phi is first scheduled. The number of
-    // Phis cannot exceed the number of prolog stages. Each stage can
-    // potentially define two values.
-    unsigned MaxPhis = PrologStage + 2;
-    if (!InKernel && (int)PrologStage <= LoopValStage)
-      MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
-    unsigned NumPhis = std::min(NumStages, MaxPhis);
-
-    unsigned NewReg = 0;
-    unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
-    // In the epilog, we may need to look back one stage to get the correct
-    // Phi name because the epilog and prolog blocks execute the same stage.
-    // The correct name is from the previous block only when the Phi has
-    // been completely scheduled prior to the epilog, and Phi value is not
-    // needed in multiple stages.
-    int StageDiff = 0;
-    if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
-        NumPhis == 1)
-      StageDiff = 1;
-    // Adjust the computations below when the phi and the loop definition
-    // are scheduled in different stages.
-    if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
-      StageDiff = StageScheduled - LoopValStage;
-    for (unsigned np = 0; np < NumPhis; ++np) {
-      // If the Phi hasn't been scheduled, then use the initial Phi operand
-      // value. Otherwise, use the scheduled version of the instruction. This
-      // is a little complicated when a Phi references another Phi.
-      if (np > PrologStage || StageScheduled >= (int)LastStageNum)
-        PhiOp1 = InitVal;
-      // Check if the Phi has already been scheduled in a prolog stage.
-      else if (PrologStage >= AccessStage + StageDiff + np &&
-               VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
-        PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
-      // Check if the Phi has already been scheduled, but the loop instruction
-      // is either another Phi, or doesn't occur in the loop.
-      else if (PrologStage >= AccessStage + StageDiff + np) {
-        // If the Phi references another Phi, we need to examine the other
-        // Phi to get the correct value.
-        PhiOp1 = LoopVal;
-        MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
-        int Indirects = 1;
-        while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
-          int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
-          if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
-            PhiOp1 = getInitPhiReg(*InstOp1, BB);
-          else
-            PhiOp1 = getLoopPhiReg(*InstOp1, BB);
-          InstOp1 = MRI.getVRegDef(PhiOp1);
-          int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
-          int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
-          if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
-              VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
-            PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
-            break;
-          }
-          ++Indirects;
-        }
-      } else
-        PhiOp1 = InitVal;
-      // If this references a generated Phi in the kernel, get the Phi operand
-      // from the incoming block.
-      if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
-        if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
-          PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
-
-      MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
-      bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
-      // In the epilog, a map lookup is needed to get the value from the kernel,
-      // or previous epilog block. How is does this depends on if the
-      // instruction is scheduled in the previous block.
-      if (!InKernel) {
-        int StageDiffAdj = 0;
-        if (LoopValStage != -1 && StageScheduled > LoopValStage)
-          StageDiffAdj = StageScheduled - LoopValStage;
-        // Use the loop value defined in the kernel, unless the kernel
-        // contains the last definition of the Phi.
-        if (np == 0 && PrevStage == LastStageNum &&
-            (StageScheduled != 0 || LoopValStage != 0) &&
-            VRMap[PrevStage - StageDiffAdj].count(LoopVal))
-          PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
-        // Use the value defined by the Phi. We add one because we switch
-        // from looking at the loop value to the Phi definition.
-        else if (np > 0 && PrevStage == LastStageNum &&
-                 VRMap[PrevStage - np + 1].count(Def))
-          PhiOp2 = VRMap[PrevStage - np + 1][Def];
-        // Use the loop value defined in the kernel.
-        else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
-                 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
-          PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
-        // Use the value defined by the Phi, unless we're generating the first
-        // epilog and the Phi refers to a Phi in a different stage.
-        else if (VRMap[PrevStage - np].count(Def) &&
-                 (!LoopDefIsPhi || (PrevStage != LastStageNum) || (LoopValStage == StageScheduled)))
-          PhiOp2 = VRMap[PrevStage - np][Def];
-      }
-
-      // Check if we can reuse an existing Phi. This occurs when a Phi
-      // references another Phi, and the other Phi is scheduled in an
-      // earlier stage. We can try to reuse an existing Phi up until the last
-      // stage of the current Phi.
-      if (LoopDefIsPhi) {
-        if (static_cast<int>(PrologStage - np) >= StageScheduled) {
-          int LVNumStages = Schedule.getStagesForPhi(LoopVal);
-          int StageDiff = (StageScheduled - LoopValStage);
-          LVNumStages -= StageDiff;
-          // Make sure the loop value Phi has been processed already.
-          if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
-            NewReg = PhiOp2;
-            unsigned ReuseStage = CurStageNum;
-            if (Schedule.isLoopCarried(this, *PhiInst))
-              ReuseStage -= LVNumStages;
-            // Check if the Phi to reuse has been generated yet. If not, then
-            // there is nothing to reuse.
-            if (VRMap[ReuseStage - np].count(LoopVal)) {
-              NewReg = VRMap[ReuseStage - np][LoopVal];
-
-              rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
-                                    &*BBI, Def, NewReg);
-              // Update the map with the new Phi name.
-              VRMap[CurStageNum - np][Def] = NewReg;
-              PhiOp2 = NewReg;
-              if (VRMap[LastStageNum - np - 1].count(LoopVal))
-                PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
-
-              if (IsLast && np == NumPhis - 1)
-                replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
-              continue;
-            }
-          }
-        }
-        if (InKernel && StageDiff > 0 &&
-            VRMap[CurStageNum - StageDiff - np].count(LoopVal))
-          PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
-      }
-
-      const TargetRegisterClass *RC = MRI.getRegClass(Def);
-      NewReg = MRI.createVirtualRegister(RC);
-
-      MachineInstrBuilder NewPhi =
-          BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
-                  TII->get(TargetOpcode::PHI), NewReg);
-      NewPhi.addReg(PhiOp1).addMBB(BB1);
-      NewPhi.addReg(PhiOp2).addMBB(BB2);
-      if (np == 0)
-        InstrMap[NewPhi] = &*BBI;
-
-      // We define the Phis after creating the new pipelined code, so
-      // we need to rename the Phi values in scheduled instructions.
-
-      unsigned PrevReg = 0;
-      if (InKernel && VRMap[PrevStage - np].count(LoopVal))
-        PrevReg = VRMap[PrevStage - np][LoopVal];
-      rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
-                            Def, NewReg, PrevReg);
-      // If the Phi has been scheduled, use the new name for rewriting.
-      if (VRMap[CurStageNum - np].count(Def)) {
-        unsigned R = VRMap[CurStageNum - np][Def];
-        rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
-                              R, NewReg);
-      }
-
-      // Check if we need to rename any uses that occurs after the loop. The
-      // register to replace depends on whether the Phi is scheduled in the
-      // epilog.
-      if (IsLast && np == NumPhis - 1)
-        replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
-
-      // In the kernel, a dependent Phi uses the value from this Phi.
-      if (InKernel)
-        PhiOp2 = NewReg;
-
-      // Update the map with the new Phi name.
-      VRMap[CurStageNum - np][Def] = NewReg;
-    }
-
-    while (NumPhis++ < NumStages) {
-      rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
-                            &*BBI, Def, NewReg, 0);
-    }
-
-    // Check if we need to rename a Phi that has been eliminated due to
-    // scheduling.
-    if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
-      replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
-  }
-}
-
-/// Generate Phis for the specified block in the generated pipelined code.
-/// These are new Phis needed because the definition is scheduled after the
-/// use in the pipelined sequence.
-void SwingSchedulerDAG::generatePhis(
-    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
-    MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
-    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
-    bool IsLast) {
-  // Compute the stage number that contains the initial Phi value, and
-  // the Phi from the previous stage.
-  unsigned PrologStage = 0;
-  unsigned PrevStage = 0;
-  unsigned StageDiff = CurStageNum - LastStageNum;
-  bool InKernel = (StageDiff == 0);
-  if (InKernel) {
-    PrologStage = LastStageNum - 1;
-    PrevStage = CurStageNum;
-  } else {
-    PrologStage = LastStageNum - StageDiff;
-    PrevStage = LastStageNum + StageDiff - 1;
-  }
-
-  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
-                                   BBE = BB->instr_end();
-       BBI != BBE; ++BBI) {
-    for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = BBI->getOperand(i);
-      if (!MO.isReg() || !MO.isDef() ||
-          !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
-        continue;
-
-      int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
-      assert(StageScheduled != -1 && "Expecting scheduled instruction.");
-      unsigned Def = MO.getReg();
-      unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
-      // An instruction scheduled in stage 0 and is used after the loop
-      // requires a phi in the epilog for the last definition from either
-      // the kernel or prolog.
-      if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
-          hasUseAfterLoop(Def, BB, MRI))
-        NumPhis = 1;
-      if (!InKernel && (unsigned)StageScheduled > PrologStage)
-        continue;
-
-      unsigned PhiOp2 = VRMap[PrevStage][Def];
-      if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
-        if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
-          PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
-      // The number of Phis can't exceed the number of prolog stages. The
-      // prolog stage number is zero based.
-      if (NumPhis > PrologStage + 1 - StageScheduled)
-        NumPhis = PrologStage + 1 - StageScheduled;
-      for (unsigned np = 0; np < NumPhis; ++np) {
-        unsigned PhiOp1 = VRMap[PrologStage][Def];
-        if (np <= PrologStage)
-          PhiOp1 = VRMap[PrologStage - np][Def];
-        if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
-          if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
-            PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
-          if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
-            PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
-        }
-        if (!InKernel)
-          PhiOp2 = VRMap[PrevStage - np][Def];
-
-        const TargetRegisterClass *RC = MRI.getRegClass(Def);
-        unsigned NewReg = MRI.createVirtualRegister(RC);
-
-        MachineInstrBuilder NewPhi =
-            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
-                    TII->get(TargetOpcode::PHI), NewReg);
-        NewPhi.addReg(PhiOp1).addMBB(BB1);
-        NewPhi.addReg(PhiOp2).addMBB(BB2);
-        if (np == 0)
-          InstrMap[NewPhi] = &*BBI;
-
-        // Rewrite uses and update the map. The actions depend upon whether
-        // we generating code for the kernel or epilog blocks.
-        if (InKernel) {
-          rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
-                                &*BBI, PhiOp1, NewReg);
-          rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
-                                &*BBI, PhiOp2, NewReg);
-
-          PhiOp2 = NewReg;
-          VRMap[PrevStage - np - 1][Def] = NewReg;
-        } else {
-          VRMap[CurStageNum - np][Def] = NewReg;
-          if (np == NumPhis - 1)
-            rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
-                                  &*BBI, Def, NewReg);
-        }
-        if (IsLast && np == NumPhis - 1)
-          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
-      }
-    }
-  }
-}
-
-/// Remove instructions that generate values with no uses.
-/// Typically, these are induction variable operations that generate values
-/// used in the loop itself.  A dead instruction has a definition with
-/// no uses, or uses that occur in the original loop only.
-void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
-                                               MBBVectorTy &EpilogBBs) {
-  // For each epilog block, check that the value defined by each instruction
-  // is used.  If not, delete it.
-  for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
-                                     MBE = EpilogBBs.rend();
-       MBB != MBE; ++MBB)
-    for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
-                                                   ME = (*MBB)->instr_rend();
-         MI != ME;) {
-      // From DeadMachineInstructionElem. Don't delete inline assembly.
-      if (MI->isInlineAsm()) {
-        ++MI;
-        continue;
-      }
-      bool SawStore = false;
-      // Check if it's safe to remove the instruction due to side effects.
-      // We can, and want to, remove Phis here.
-      if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
-        ++MI;
-        continue;
-      }
-      bool used = true;
-      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
-                                      MOE = MI->operands_end();
-           MOI != MOE; ++MOI) {
-        if (!MOI->isReg() || !MOI->isDef())
-          continue;
-        unsigned reg = MOI->getReg();
-        // Assume physical registers are used, unless they are marked dead.
-        if (TargetRegisterInfo::isPhysicalRegister(reg)) {
-          used = !MOI->isDead();
-          if (used)
-            break;
-          continue;
-        }
-        unsigned realUses = 0;
-        for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
-                                               EI = MRI.use_end();
-             UI != EI; ++UI) {
-          // Check if there are any uses that occur only in the original
-          // loop.  If so, that's not a real use.
-          if (UI->getParent()->getParent() != BB) {
-            realUses++;
-            used = true;
-            break;
-          }
-        }
-        if (realUses > 0)
-          break;
-        used = false;
-      }
-      if (!used) {
-        LIS.RemoveMachineInstrFromMaps(*MI);
-        MI++->eraseFromParent();
-        continue;
-      }
-      ++MI;
-    }
-  // In the kernel block, check if we can remove a Phi that generates a value
-  // used in an instruction removed in the epilog block.
-  for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
-                                   BBE = KernelBB->getFirstNonPHI();
-       BBI != BBE;) {
-    MachineInstr *MI = &*BBI;
-    ++BBI;
-    unsigned reg = MI->getOperand(0).getReg();
-    if (MRI.use_begin(reg) == MRI.use_end()) {
-      LIS.RemoveMachineInstrFromMaps(*MI);
-      MI->eraseFromParent();
-    }
-  }
-}
-
-/// For loop carried definitions, we split the lifetime of a virtual register
-/// that has uses past the definition in the next iteration. A copy with a new
-/// virtual register is inserted before the definition, which helps with
-/// generating a better register assignment.
-///
-///   v1 = phi(a, v2)     v1 = phi(a, v2)
-///   v2 = phi(b, v3)     v2 = phi(b, v3)
-///   v3 = ..             v4 = copy v1
-///   .. = V1             v3 = ..
-///                       .. = v4
-void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
-                                       MBBVectorTy &EpilogBBs,
-                                       SMSchedule &Schedule) {
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  for (auto &PHI : KernelBB->phis()) {
-    unsigned Def = PHI.getOperand(0).getReg();
-    // Check for any Phi definition that used as an operand of another Phi
-    // in the same block.
-    for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
-                                                 E = MRI.use_instr_end();
-         I != E; ++I) {
-      if (I->isPHI() && I->getParent() == KernelBB) {
-        // Get the loop carried definition.
-        unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
-        if (!LCDef)
-          continue;
-        MachineInstr *MI = MRI.getVRegDef(LCDef);
-        if (!MI || MI->getParent() != KernelBB || MI->isPHI())
-          continue;
-        // Search through the rest of the block looking for uses of the Phi
-        // definition. If one occurs, then split the lifetime.
-        unsigned SplitReg = 0;
-        for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
-                                    KernelBB->instr_end()))
-          if (BBJ.readsRegister(Def)) {
-            // We split the lifetime when we find the first use.
-            if (SplitReg == 0) {
-              SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
-              BuildMI(*KernelBB, MI, MI->getDebugLoc(),
-                      TII->get(TargetOpcode::COPY), SplitReg)
-                  .addReg(Def);
-            }
-            BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
-          }
-        if (!SplitReg)
-          continue;
-        // Search through each of the epilog blocks for any uses to be renamed.
-        for (auto &Epilog : EpilogBBs)
-          for (auto &I : *Epilog)
-            if (I.readsRegister(Def))
-              I.substituteRegister(Def, SplitReg, 0, *TRI);
-        break;
-      }
-    }
-  }
-}
-
-/// Remove the incoming block from the Phis in a basic block.
-static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
-  for (MachineInstr &MI : *BB) {
-    if (!MI.isPHI())
-      break;
-    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
-      if (MI.getOperand(i + 1).getMBB() == Incoming) {
-        MI.RemoveOperand(i + 1);
-        MI.RemoveOperand(i);
-        break;
-      }
-  }
-}
-
-/// Create branches from each prolog basic block to the appropriate epilog
-/// block.  These edges are needed if the loop ends before reaching the
-/// kernel.
-void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB,
-                                    MBBVectorTy &PrologBBs,
-                                    MachineBasicBlock *KernelBB,
-                                    MBBVectorTy &EpilogBBs,
-                                    SMSchedule &Schedule, ValueMapTy *VRMap) {
-  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
-  MachineInstr *IndVar = Pass.LI.LoopInductionVar;
-  MachineInstr *Cmp = Pass.LI.LoopCompare;
-  MachineBasicBlock *LastPro = KernelBB;
-  MachineBasicBlock *LastEpi = KernelBB;
-
-  // Start from the blocks connected to the kernel and work "out"
-  // to the first prolog and the last epilog blocks.
-  SmallVector<MachineInstr *, 4> PrevInsts;
-  unsigned MaxIter = PrologBBs.size() - 1;
-  unsigned LC = UINT_MAX;
-  unsigned LCMin = UINT_MAX;
-  for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
-    // Add branches to the prolog that go to the corresponding
-    // epilog, and the fall-thru prolog/kernel block.
-    MachineBasicBlock *Prolog = PrologBBs[j];
-    MachineBasicBlock *Epilog = EpilogBBs[i];
-    // We've executed one iteration, so decrement the loop count and check for
-    // the loop end.
-    SmallVector<MachineOperand, 4> Cond;
-    // Check if the LOOP0 has already been removed. If so, then there is no need
-    // to reduce the trip count.
-    if (LC != 0)
-      LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond,
-                                PrevInsts, j, MaxIter);
-
-    // Record the value of the first trip count, which is used to determine if
-    // branches and blocks can be removed for constant trip counts.
-    if (LCMin == UINT_MAX)
-      LCMin = LC;
-
-    unsigned numAdded = 0;
-    if (TargetRegisterInfo::isVirtualRegister(LC)) {
-      Prolog->addSuccessor(Epilog);
-      numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
-    } else if (j >= LCMin) {
-      Prolog->addSuccessor(Epilog);
-      Prolog->removeSuccessor(LastPro);
-      LastEpi->removeSuccessor(Epilog);
-      numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
-      removePhis(Epilog, LastEpi);
-      // Remove the blocks that are no longer referenced.
-      if (LastPro != LastEpi) {
-        LastEpi->clear();
-        LastEpi->eraseFromParent();
-      }
-      LastPro->clear();
-      LastPro->eraseFromParent();
-    } else {
-      numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
-      removePhis(Epilog, Prolog);
-    }
-    LastPro = Prolog;
-    LastEpi = Epilog;
-    for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
-                                                   E = Prolog->instr_rend();
-         I != E && numAdded > 0; ++I, --numAdded)
-      updateInstruction(&*I, false, j, 0, Schedule, VRMap);
-  }
-}
-
 /// Return true if we can compute the amount the instruction changes
 /// during each iteration. Set Delta to the amount of the change.
 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
@@ -2907,228 +2092,6 @@
   NewMI.setMemRefs(MF, NewMMOs);
 }
 
-/// Clone the instruction for the new pipelined loop and update the
-/// memory operands, if needed.
-MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
-                                            unsigned CurStageNum,
-                                            unsigned InstStageNum) {
-  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
-  // Check for tied operands in inline asm instructions. This should be handled
-  // elsewhere, but I'm not sure of the best solution.
-  if (OldMI->isInlineAsm())
-    for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
-      const auto &MO = OldMI->getOperand(i);
-      if (MO.isReg() && MO.isUse())
-        break;
-      unsigned UseIdx;
-      if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
-        NewMI->tieOperands(i, UseIdx);
-    }
-  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
-  return NewMI;
-}
-
-/// Clone the instruction for the new pipelined loop. If needed, this
-/// function updates the instruction using the values saved in the
-/// InstrChanges structure.
-MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
-                                                     unsigned CurStageNum,
-                                                     unsigned InstStageNum,
-                                                     SMSchedule &Schedule) {
-  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
-  DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
-      InstrChanges.find(getSUnit(OldMI));
-  if (It != InstrChanges.end()) {
-    std::pair<unsigned, int64_t> RegAndOffset = It->second;
-    unsigned BasePos, OffsetPos;
-    if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
-      return nullptr;
-    int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
-    MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
-    if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
-      NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
-    NewMI->getOperand(OffsetPos).setImm(NewOffset);
-  }
-  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
-  return NewMI;
-}
-
-/// Update the machine instruction with new virtual registers.  This
-/// function may change the defintions and/or uses.
-void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
-                                          unsigned CurStageNum,
-                                          unsigned InstrStageNum,
-                                          SMSchedule &Schedule,
-                                          ValueMapTy *VRMap) {
-  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = NewMI->getOperand(i);
-    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
-      continue;
-    unsigned reg = MO.getReg();
-    if (MO.isDef()) {
-      // Create a new virtual register for the definition.
-      const TargetRegisterClass *RC = MRI.getRegClass(reg);
-      unsigned NewReg = MRI.createVirtualRegister(RC);
-      MO.setReg(NewReg);
-      VRMap[CurStageNum][reg] = NewReg;
-      if (LastDef)
-        replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
-    } else if (MO.isUse()) {
-      MachineInstr *Def = MRI.getVRegDef(reg);
-      // Compute the stage that contains the last definition for instruction.
-      int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
-      unsigned StageNum = CurStageNum;
-      if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
-        // Compute the difference in stages between the defintion and the use.
-        unsigned StageDiff = (InstrStageNum - DefStageNum);
-        // Make an adjustment to get the last definition.
-        StageNum -= StageDiff;
-      }
-      if (VRMap[StageNum].count(reg))
-        MO.setReg(VRMap[StageNum][reg]);
-    }
-  }
-}
-
-/// Return the instruction in the loop that defines the register.
-/// If the definition is a Phi, then follow the Phi operand to
-/// the instruction in the loop.
-MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
-  SmallPtrSet<MachineInstr *, 8> Visited;
-  MachineInstr *Def = MRI.getVRegDef(Reg);
-  while (Def->isPHI()) {
-    if (!Visited.insert(Def).second)
-      break;
-    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
-      if (Def->getOperand(i + 1).getMBB() == BB) {
-        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
-        break;
-      }
-  }
-  return Def;
-}
-
-/// Return the new name for the value from the previous stage.
-unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
-                                          unsigned LoopVal, unsigned LoopStage,
-                                          ValueMapTy *VRMap,
-                                          MachineBasicBlock *BB) {
-  unsigned PrevVal = 0;
-  if (StageNum > PhiStage) {
-    MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
-    if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
-      // The name is defined in the previous stage.
-      PrevVal = VRMap[StageNum - 1][LoopVal];
-    else if (VRMap[StageNum].count(LoopVal))
-      // The previous name is defined in the current stage when the instruction
-      // order is swapped.
-      PrevVal = VRMap[StageNum][LoopVal];
-    else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
-      // The loop value hasn't yet been scheduled.
-      PrevVal = LoopVal;
-    else if (StageNum == PhiStage + 1)
-      // The loop value is another phi, which has not been scheduled.
-      PrevVal = getInitPhiReg(*LoopInst, BB);
-    else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
-      // The loop value is another phi, which has been scheduled.
-      PrevVal =
-          getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
-                        LoopStage, VRMap, BB);
-  }
-  return PrevVal;
-}
-
-/// Rewrite the Phi values in the specified block to use the mappings
-/// from the initial operand. Once the Phi is scheduled, we switch
-/// to using the loop value instead of the Phi value, so those names
-/// do not need to be rewritten.
-void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
-                                         unsigned StageNum,
-                                         SMSchedule &Schedule,
-                                         ValueMapTy *VRMap,
-                                         InstrMapTy &InstrMap) {
-  for (auto &PHI : BB->phis()) {
-    unsigned InitVal = 0;
-    unsigned LoopVal = 0;
-    getPhiRegs(PHI, BB, InitVal, LoopVal);
-    unsigned PhiDef = PHI.getOperand(0).getReg();
-
-    unsigned PhiStage =
-        (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
-    unsigned LoopStage =
-        (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
-    unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
-    if (NumPhis > StageNum)
-      NumPhis = StageNum;
-    for (unsigned np = 0; np <= NumPhis; ++np) {
-      unsigned NewVal =
-          getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
-      if (!NewVal)
-        NewVal = InitVal;
-      rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
-                            PhiDef, NewVal);
-    }
-  }
-}
-
-/// Rewrite a previously scheduled instruction to use the register value
-/// from the new instruction. Make sure the instruction occurs in the
-/// basic block, and we don't change the uses in the new instruction.
-void SwingSchedulerDAG::rewriteScheduledInstr(
-    MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
-    unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
-    unsigned NewReg, unsigned PrevReg) {
-  bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
-  int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
-  // Rewrite uses that have been scheduled already to use the new
-  // Phi register.
-  for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
-                                         EI = MRI.use_end();
-       UI != EI;) {
-    MachineOperand &UseOp = *UI;
-    MachineInstr *UseMI = UseOp.getParent();
-    ++UI;
-    if (UseMI->getParent() != BB)
-      continue;
-    if (UseMI->isPHI()) {
-      if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
-        continue;
-      if (getLoopPhiReg(*UseMI, BB) != OldReg)
-        continue;
-    }
-    InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
-    assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
-    SUnit *OrigMISU = getSUnit(OrigInstr->second);
-    int StageSched = Schedule.stageScheduled(OrigMISU);
-    int CycleSched = Schedule.cycleScheduled(OrigMISU);
-    unsigned ReplaceReg = 0;
-    // This is the stage for the scheduled instruction.
-    if (StagePhi == StageSched && Phi->isPHI()) {
-      int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
-      if (PrevReg && InProlog)
-        ReplaceReg = PrevReg;
-      else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
-               (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
-        ReplaceReg = PrevReg;
-      else
-        ReplaceReg = NewReg;
-    }
-    // The scheduled instruction occurs before the scheduled Phi, and the
-    // Phi is not loop carried.
-    if (!InProlog && StagePhi + 1 == StageSched &&
-        !Schedule.isLoopCarried(this, *Phi))
-      ReplaceReg = NewReg;
-    if (StagePhi > StageSched && Phi->isPHI())
-      ReplaceReg = NewReg;
-    if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
-      ReplaceReg = NewReg;
-    if (ReplaceReg) {
-      MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
-      UseOp.setReg(ReplaceReg);
-    }
-  }
-}
-
 /// Check if we can change the instruction to use an offset value from the
 /// previous iteration. If so, return true and set the base and offset values
 /// so that we can rewrite the load, if necessary.
@@ -3190,6 +2153,24 @@
   return true;
 }
 
+/// Return the instruction in the loop that defines the register.
+/// If the definition is a Phi, then follow the Phi operand to
+/// the instruction in the loop.
+MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
+  SmallPtrSet<MachineInstr *, 8> Visited;
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+  while (Def->isPHI()) {
+    if (!Visited.insert(Def).second)
+      break;
+    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
+      if (Def->getOperand(i + 1).getMBB() == BB) {
+        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
+        break;
+      }
+  }
+  return Def;
+}
+
 /// Apply changes to the instruction if needed. The changes are need
 /// to improve the scheduling and depend up on the final schedule.
 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
@@ -3257,6 +2238,13 @@
   if (!DI->mayStore() || !SI->mayLoad())
     return false;
 
+  if (!SI->mayAlias(AAForDep, *DI, /*UseTBAA=*/true) ||
+      !DI->mayAlias(AAForDep, *SI, /*UseTBAA=*/true))
+    return false;
+  if (DI->hasOneMemOperand() && (*DI->memoperands_begin())->getPseudoValue())
+    return false;
+  dbgs() << "ISLoopCarried: "; DI->print(dbgs(), true, true); dbgs() << " + ";
+  SI->print(dbgs(), true, true); dbgs() << "\n";
   unsigned DeltaS, DeltaD;
   if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
     return true;
@@ -3295,7 +2283,7 @@
 
   if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
     return true;
-
+  dbgs() << "probably not\n";
   return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
 }
 
@@ -3322,7 +2310,6 @@
   int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
   for (int curCycle = StartCycle; curCycle != termCycle;
        forward ? ++curCycle : --curCycle) {
-
     // Add the already scheduled instructions at the specified cycle to the
     // DFA.
     ProcItinResources.clearResources();
@@ -4078,3 +3065,344 @@
   std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
 }
 
+CFG::CFG(MachineLoop *Loop, SMSchedule &Schedule, SwingSchedulerDAG &DAG) :
+  Pass(DAG.getPass()), Schedule(Schedule), MRI(DAG.MF.getRegInfo()),
+  MF(DAG.MF), DAG(DAG) {
+  TII = MF.getSubtarget().getInstrInfo();
+  OrigBB = Loop->getTopBlock();
+
+  MachineBasicBlock &Preheader = *Loop->getLoopPreheader();
+  unsigned NumStages = Schedule.getMaxStageCount() + 1;
+
+  // Analyze the original loop to extract the main loop condition.
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> KernelLoopCond;
+  TII->analyzeBranch(*OrigBB, TBB, FBB, KernelLoopCond);
+
+  // State for TII->reduceLoopCount.
+  MachineInstr *IndVar = Pass.LI.LoopInductionVar;
+  MachineInstr *Cmp = Pass.LI.LoopCompare;
+  SmallVector<MachineInstr *, 4> PrevInsts;
+
+  // We create a ladder of prolog and epilog blocks. Each prolog block tests for
+  // termination and if so jumps to an epilog block;
+  //
+  //   prolog1
+  //     i0[stage 0]
+  //     done? goto epilog1
+  //
+  //   prolog2
+  //     i0[stage 1]
+  //     i1[stage 0]
+  //     done? goto epilog2
+  //
+  //   kernel
+  //     i0..(n-2)[stage 2]
+  //     i1..(n-1)[stage 1]
+  //     i2..n    [stage 0]
+  //     done? goto epilog2 else goto kernel
+  //
+  //   epilog2
+  //     i(n-1) [stage 2]
+  //     goto epilog1
+  //
+  //   epilog1
+  //     in [stage 1]
+  //     in [stage 2]
+  //     goto loop exit
+  //
+  // Note that this approach allows us to deal with any trip count, but means
+  // we must complete loop iterations during the epilog in the *reverse* order
+  // they were started. If all iterations are purely independent this may not
+  // be an issue but this may generate incorrect code if we have a loop-carried
+  // dependence (which is why we are conservative with loop-carried
+  // dependences).
+
+  std::deque<Stage *> PrologStages, EpilogStages;
+  std::vector<MachineBasicBlock *> Blocks;
+  std::vector<CFGBlock> Prologs, Epilogs;
+  Prologs.reserve(NumStages);
+  Epilogs.reserve(NumStages);
+
+  LLVM_DEBUG(dbgs() << "Generate prolog\n");
+  // Generate prolog blocks until we have NumStages - 1 iterations in flight.
+  // The NumStages'th iteration will be launched by the kernel.
+  for (unsigned PrologStage = 0; PrologStage < NumStages - 1; ++PrologStage) {
+    CFGBlock PB(*this);
+    for (unsigned I = 0; I <= PrologStage; ++I) {
+      LLVM_DEBUG(dbgs() << " Clone stage " << I << " for prolog block "
+                        << Prologs.size() << "\n");
+      if (I == 0 && PrologStages.empty()) {
+        PrologStages.push_back(PB.CreateStage({}, I));
+      } else {
+        PrologStages.push_back(PB.CreateStage({PrologStages.front()}, I));
+        if (I > 0)
+          PrologStages.pop_front();
+      }
+    }
+
+    SmallVector<MachineOperand, 4> Cond;
+    // TODO(jmolloy): This API is really too overfit for Hexagon and the
+    // previous implementation's details. It should be refactored.
+    TII->reduceLoopCount(*PB.getMBB(), Preheader, IndVar, *Cmp, Cond,
+                             PrevInsts, PrologStage, NumStages - 1);
+    PB.setBranchCondition(std::move(Cond));
+
+    Blocks.push_back(PB.finalize());
+    Prologs.push_back(std::move(PB));
+  }
+  LLVM_DEBUG(dbgs() << "Generate kernel\n");
+
+  // Create the kernel block.
+  CFGBlock KernelPB(*this);
+
+  // We create the kernel by creating one instance of each stage. We create
+  // stage zero last; Stage zero takes stage one as input for loop-carried phis.
+  //   Iteration a:  [0 | 1 | 2 ]
+  //   Iteration b:     [ 0 | 1 | 2]
+  //                      ^ b[0] may depend on a[1] for loop-carried phis. If
+  //                        b[0] depended on a[2], the schedule would be
+  //                        invalid.
+  //
+  std::deque<Stage *> KernelStages(1, nullptr);
+  for (int I = 1; I < NumStages; ++I)
+    KernelStages.push_back(
+        KernelPB.CreateStage({PrologStages[I - 1], nullptr}, I));
+  KernelStages[0] = KernelPB.CreateStage({KernelStages[1]}, 0);
+  for (int I = 1; I < NumStages; ++I)
+    KernelStages[I]->setLastInput(KernelStages[I - 1]);
+
+  KernelPB.setBranchCondition(std::move(KernelLoopCond));
+  // Now interleave all the instructions in the kernel in schedule order.
+  KernelPB.sortAll();
+  LLVM_DEBUG(dbgs() << "Generate epilog\n");
+
+  // Seed the Epilog with the final kernel state.
+  EpilogStages = KernelStages;
+
+  // The last stage is complete so can be discarded.
+  EpilogStages.pop_back();
+
+  assert(PrologStages.size() == EpilogStages.size());
+  Stage *LastStage = nullptr;
+  for (int PeelStage = NumStages - 1; PeelStage > 0; --PeelStage) {
+    CFGBlock PB(*this);
+    // In this epilog block our input blocks define stages [0,PeelStage]. This
+    // block will peel off and complete the PeelStage'th iteration.
+    LLVM_DEBUG(dbgs() << " Clone stage " << PeelStage << " for epilog block "
+                      << Epilogs.size() << "\n");
+    Stage *S =
+        PB.CreateStage({EpilogStages.back(), PrologStages.back()}, PeelStage);
+    for (unsigned I = PeelStage + 1; I < NumStages; ++I) {
+      LLVM_DEBUG(dbgs() << " Clone stage " << I << " for epilog block "
+                        << Epilogs.size() << "\n");
+      S = PB.CreateStage({S}, I);
+    }
+    PrologStages.pop_back();
+    EpilogStages.pop_back();
+    LastStage = S;
+
+    // Pass through all stages before PeelStage. We do this by creating a dummy
+    // stage to force PHI creation.
+    for (unsigned I = 0; I < PeelStage - 1; ++I)
+      EpilogStages[I] = PB.CreateStage({PrologStages[I], EpilogStages[I]}, ~1U);
+
+    // Connect up the associated prolog block to both its normal successor and
+    // this epilog.
+    Prologs[PeelStage - 1].addSuccessors(
+        PB, PeelStage == NumStages - 1 ? KernelPB : Prologs[PeelStage]);
+    if (!Epilogs.empty()) {
+      // We're not the first epilog block, so the previous epilog block falls
+      // through to us.
+      Epilogs.back().addSuccessor(PB);
+    } else {
+      // We are the first epilog block, so the kernel falls through to us.
+      KernelPB.addSuccessors(KernelPB, PB);
+      Blocks.push_back(KernelPB.finalize());
+    }
+    Blocks.push_back(PB.finalize());
+    Epilogs.push_back(std::move(PB));
+  }
+
+  LastStage->rewriteUsesOutsideOfLoop();
+
+  LLVM_DEBUG(for (auto *MBB : Blocks) MBB->dump(););
+  Entry = Blocks.front();
+  Exit = Blocks.back();
+}
+
+Stage::Stage(CFGBlock *Block, std::initializer_list<Stage *> Ins,
+             unsigned StageIdx)
+    : Ctx(Block->getContext()), Block(Block), MBB(Block->getMBB()),
+      Inputs(Ins.begin(), Ins.end()) {
+  auto &MRI = Ctx.MRI;
+  if (Inputs.size() > 0) {
+    Remaps = Inputs[0]->remaps();
+  } else {
+    // No inputs; loop-carried PHIs take their init value.
+    for (MachineInstr &MI : Ctx.OrigBB->phis())
+      Remaps[MI.getOperand(0).getReg()] = getInitPhiReg(MI, Ctx.OrigBB);
+  }
+
+  // On stage zero our input should only be used to collect loop-carried PHI
+  // values. No other outputs from previous stages should be considered.
+  bool OnlyLoopPhis = StageIdx == 0;
+
+  // Start by creating PHIs for all live inputs.
+  auto InsertPt = MBB->getFirstNonPHI();
+  if (!Inputs.empty())
+    for (auto &KV : Inputs[0]->remaps()) {
+      if (OnlyLoopPhis && !MRI.getUniqueVRegDef(KV.first)->isPHI())
+        continue;
+      if (Inputs.size() == 1) {
+        Remaps.insert(KV);
+        continue;
+      }
+      unsigned OrigReg = KV.first;
+      unsigned DestReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+      Phis.push_back(BuildMI(*MBB, InsertPt, DebugLoc(),
+                             Ctx.TII->get(TargetOpcode::PHI), DestReg));
+      PhiRegs[Phis.back()] = OrigReg;
+      Remaps[OrigReg] = DestReg;
+    }
+
+  SmallVector<MachineInstr *, 4> SortedInstrs;
+  for (auto &MI : *Ctx.OrigBB) {
+    if (Ctx.Schedule.isScheduledAtStage(Ctx.DAG.getSUnit(&MI), StageIdx) &&
+        !MI.isPHI())
+      SortedInstrs.push_back(&MI);
+  }
+  sort(SortedInstrs, [&](MachineInstr *A, MachineInstr *B) {
+    return Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(A)) <
+           Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(B));
+  });
+
+  for (auto *MI : SortedInstrs) {
+    MachineInstr *NewMI = Ctx.MF.CloneMachineInstr(MI);
+    MBB->insert(MBB->end(), NewMI);
+    for (auto &MO : NewMI->defs()) {
+      unsigned OrigReg = MO.getReg();
+      unsigned DestReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
+      NewMI->substituteRegister(OrigReg, DestReg, MO.getSubReg(),
+                                *MRI.getTargetRegisterInfo());
+      Remaps[OrigReg] = DestReg;
+    }
+    Instrs.push_back(NewMI);
+    Ctx.Rewrites[NewMI] = MI;
+
+    LLVM_DEBUG(dbgs() << "  add (stage " << StageIdx << ") " << *NewMI);
+  }
+
+  PostRemaps = Remaps;
+  for (auto &MI : *Ctx.OrigBB) {
+    if (MI.isPHI() && Remaps.count(getLoopPhiReg(MI, Ctx.OrigBB)))
+      PostRemaps[MI.getOperand(0).getReg()] =
+          Remaps[getLoopPhiReg(MI, Ctx.OrigBB)];
+  }
+}
+
+void Stage::finalize() {
+  if (Inputs.size() > 1) {
+    assert(Inputs.size() == 2);
+    const DenseMap<unsigned, unsigned> &R0 = Inputs[0]->remaps();
+    const DenseMap<unsigned, unsigned> &R1 = Inputs[1]->remaps();
+    for (MachineInstr *Phi : Phis) {
+      unsigned DestReg = Phi->getOperand(0).getReg();
+      unsigned OrigReg = PhiRegs[Phi];
+
+      MachineInstrBuilder B(*Phi->getParent()->getParent(), Phi);
+      unsigned R0Reg = R0.lookup(OrigReg);
+      B.addReg(R0Reg).addMBB(Inputs[0]->getMBB());
+      Ctx.MRI.constrainRegClass(DestReg, Ctx.MRI.getRegClass(R0Reg));
+
+      // Even though we may have been defined with two inputs it may turn out
+      // that we only have one predecessor.
+      if (Block->predecessors().size() > 1) {
+        unsigned R1Reg = R1.lookup(OrigReg);
+        B.addReg(R1Reg).addMBB(Inputs[1]->getMBB());
+        Ctx.MRI.constrainRegClass(DestReg, Ctx.MRI.getRegClass(R1Reg));
+      }
+    }
+  }
+
+  for (MachineInstr *MI : Instrs) {
+    for (auto &MO : MI->uses()) {
+      if (!MO.isReg() || !Remaps.count(MO.getReg()))
+        continue;
+      Ctx.MRI.constrainRegClass(Remaps[MO.getReg()],
+                                Ctx.MRI.getRegClass(MO.getReg()));
+      MO.ChangeToRegister(Remaps[MO.getReg()], /*isDef=*/false);
+    }
+  }
+}
+
+void Stage::rewriteUsesOutsideOfLoop() {
+  for (auto &KV : remaps()) {
+    unsigned RewrittenReg = KV.second;
+    unsigned OrigReg = KV.first;
+    for (MachineOperand &MO : Ctx.MRI.use_operands(OrigReg)) {
+      if (MO.getParent()->getParent() != Ctx.OrigBB)
+        MO.ChangeToRegister(RewrittenReg, /*isDef=*/false);
+    }
+  }
+}
+
+CFGBlock::CFGBlock(CFG &Ctx) : Ctx(Ctx) {
+  MBB = Ctx.MF.CreateMachineBasicBlock(Ctx.OrigBB->getBasicBlock());
+  Ctx.MF.insert(Ctx.OrigBB->getIterator(), MBB);
+}
+
+void CFGBlock::sortAll() {
+  std::vector<MachineInstr*> Instrs;
+  DenseMap<MachineInstr*, MachineInstr*> Rewrites;
+  for (auto &S : Stages) {
+    Instrs.insert(Instrs.end(), S.getInstrs().begin(), S.getInstrs().end());
+  }
+  sort(Instrs, [&](MachineInstr *A, MachineInstr *B) {
+    return Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(Ctx.Rewrites[A])) <
+        Ctx.Schedule.cycleScheduled(Ctx.DAG.getSUnit(Ctx.Rewrites[B]));
+  });
+  for (MachineInstr *MI : Instrs) {
+    MI->removeFromParent();
+    MBB->insert(MBB->end(), MI);
+  }
+}
+
+MachineBasicBlock *CFGBlock::finalize() {
+  for (auto &S : Stages)
+    S.finalize();
+  return MBB;
+}
+
+Stage *CFGBlock::CreateStage(std::initializer_list<Stage*> Inputs, unsigned StageIdx) {
+  Stages.emplace_back(this, Inputs, StageIdx);
+  return &Stages.back();
+}
+
+void CFGBlock::addSuccessor(CFGBlock &Succ) {
+  MBB->addSuccessor(Succ.MBB);
+  Succ.Predecessors.push_back(this);
+  assert(Ctx.TII->insertUnconditionalBranch(*MBB, Succ.MBB, DebugLoc()) != 0);
+}
+
+void CFGBlock::addSuccessors(CFGBlock &TrueSucc,
+                             CFGBlock &FalseSucc) {
+  FalseSucc.Predecessors.push_back(this);
+  if (BranchCondition.empty()) {
+    MBB->addSuccessor(FalseSucc.MBB);
+    Ctx.TII->insertUnconditionalBranch(*MBB, FalseSucc.MBB, DebugLoc());
+    return;
+  }
+  TrueSucc.Predecessors.push_back(this);
+
+  // We do this dance to transfer the successor probabilities from the loop
+  // block. This is valid for all prolog blocks and the kernel block.
+  auto TrueI = Ctx.OrigBB->succ_begin();
+  auto FalseI = std::next(TrueI);
+  MBB->addSuccessor(TrueSucc.MBB,
+                    Ctx.Pass.MBPI->getEdgeProbability(Ctx.OrigBB, TrueI));
+  MBB->addSuccessor(FalseSucc.MBB,
+                    Ctx.Pass.MBPI->getEdgeProbability(Ctx.OrigBB, FalseI));
+  assert(Ctx.TII->insertBranch(*MBB, TrueSucc.MBB, FalseSucc.MBB,
+                               BranchCondition, DebugLoc()) != 0);
+}