Index: include/llvm/CodeGen/MachineModuloSched.h
===================================================================
--- /dev/null
+++ include/llvm/CodeGen/MachineModuloSched.h
@@ -0,0 +1,190 @@
+//===--- llvm/CodeGen/MachineModuloSched.h - Common Base Class---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the data dependence graph class, which is used as the
+// common base class for software pipeline schedulers.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEMODULOSCHED_H
+#define LLVM_CODEGEN_MACHINEMODULOSCHED_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+
+namespace llvm {
+
+/// The main class in the implementation of the target independent
+/// software pipeliner pass.
+class MachineModuloSched : public MachineFunctionPass {
+public:
+  MachineFunction *MF = nullptr;
+  const MachineLoopInfo *MLI = nullptr;
+  const MachineDominatorTree *MDT = nullptr;
+  const InstrItineraryData *InstrItins;
+  const TargetInstrInfo *TII = nullptr;
+  RegisterClassInfo RegClassInfo;
+
+#ifndef NDEBUG
+  static int NumTries;
+#endif
+  /// Cache the target analysis information about the loop.
+  struct LoopInfo {
+    MachineBasicBlock *TBB = nullptr;
+    MachineBasicBlock *FBB = nullptr;
+    SmallVector<MachineOperand, 4> BrCond;
+    MachineInstr *LoopInductionVar = nullptr;
+    MachineInstr *LoopCompare = nullptr;
+  };
+  LoopInfo LI;
+
+  static char ID;
+  MachineModuloSched() : MachineFunctionPass(ID) {
+    initializeMachineModuloSchedPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<LiveIntervals>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool canPipelineLoop(MachineLoop &L);
+  bool scheduleLoop(MachineLoop &L);
+  bool swpModuloScheduler(MachineLoop &L);
+};
+
+/// This class builds the dependence graph for the instructions in a loop,
+/// and attempts to schedule the instructions using the modulo algorithm.
+class ModuloSchedulerDAG : public ScheduleDAGInstrs {
+  MachineModuloSched &Pass;
+  /// The minimum initiation interval between iterations for this schedule.
+  unsigned MII;
+  /// Set to true if a valid pipelined schedule is found for the loop.
+  bool Scheduled;
+  MachineLoop &Loop;
+  LiveIntervals &LIS;
+  const RegisterClassInfo &RegClassInfo;
+
+  /// Ordered list of DAG postprocessing steps.
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
+public:
+  /// DdgUnit data structure to help building loop carried dependence.
+  class DdgUnit {
+  public:
+    ModuloSchedulerDAG *DAG;
+
+    SUnit *SU;
+    // DdgUnit node number, it is equal to SU->NodeNum.
+    unsigned NodeNum;
+
+    // Preds and Succs will include backedges.
+    SmallVector<SDep, 4> Preds;
+    SmallVector<SDep, 4> Succs;
+
+    DdgUnit(ModuloSchedulerDAG *DAG, SUnit *SU)
+        : DAG(DAG), SU(SU), NodeNum(SU->NodeNum), Preds(SU->Preds),
+          Succs(SU->Succs) {}
+    bool addPred(const SDep &D, DdgUnit *DDef);
+  };
+
+  /// This array holds the ddg nodes in the graph;
+  std::vector<DdgUnit> DdgUnits;
+  /// Each SUnit in the DDG block is mapped to an DdgUnit.
+  DenseMap<SUnit *, DdgUnit *> DdgUnitMap;
+
+  /// Array and number of backedges (edges with distance > 0) in the DDG.
+  unsigned NumBackEdges = 0;
+  std::vector<std::pair<SDep *, DdgUnit *>> BackEdges;
+
+  // Holds information on an SCC (Strongly Connected Component) of the DDG.
+  class Scc {
+  public:
+    ModuloSchedulerDAG *DAG;
+    // A bitmap that represents the DdgUnits that are in the SCC.
+    BitVector Nodes;
+    int NumNodes;
+
+    // Array and number of backedges in the SCC.
+    std::vector<std::pair<SDep *, DdgUnit *>> BackEdges;
+    int NumBackEdges;
+
+  public:
+    Scc(ModuloSchedulerDAG *DAG, int NumNodes)
+        : DAG(DAG), NumNodes(NumNodes), NumBackEdges(0) {}
+    void creatScc(BitVector &SccNodes);
+  };
+
+  std::set<SDep *> InSccs;
+  /// Array that holds the SCCs and their number.
+  std::vector<Scc> Sccs;
+  int NumSccs = 0;
+
+  ModuloSchedulerDAG(MachineModuloSched &P, MachineLoop &L, LiveIntervals &lis,
+                     const RegisterClassInfo &rci)
+      : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), MII(0),
+        Scheduled(false), Loop(L), LIS(lis), RegClassInfo(rci) {
+    P.MF->getSubtarget().getSMSMutations(Mutations);
+  }
+
+  void schedule() override;
+
+  /// Return true if the loop kernel has been scheduled.
+  bool hasNewSchedule() { return Scheduled; }
+
+  bool addLoopCarriedDependences();
+  DdgUnit *getDdgUnit(SUnit *SU);
+  bool isBackEdge(DdgUnit *Src, DdgUnit *Dst) {
+    return Src->NodeNum >= Dst->NodeNum;
+  }
+
+  void buildSccs();
+  void findNodesByOrder(unsigned NodeNum, BitVector &Result, bool Succs);
+
+  void viewGraph(const Twine &Name, const Twine &Title);
+  void viewGraph();
+
+private:
+  void postprocessDAG();
+  void addLoopCarriedRegDeps();
+  void createLoopRegDep(SUnit *Use, SDep &Dep);
+  void addLoopCarriedRegDep(MachineRegisterInfo &MRI, unsigned Reg);
+  void addLoopCarriedMemDeps();
+  void createLoopMemDep(SUnit *Src, SUnit *Dstint, int Latency);
+  int getLastDefOperand(MachineRegisterInfo &MRI, unsigned Reg, unsigned &Min,
+                        unsigned &Max);
+  MachineInstr *getSingleDef(unsigned Reg, const MachineRegisterInfo *MRI);
+  bool computeDelta(MachineInstr &MI, unsigned &Delta);
+  DdgUnit *newDdgUnit(SUnit *SU);
+};
+
+} // namespace llvm
+
+#endif
Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -423,6 +423,11 @@
   /// This pass performs software pipelining on machine instructions.
   extern char &MachinePipelinerID;
 
+  /// This pass performs modulo scheduling on machine instructions.
+  /// This is an improved version of the origin machine pipeline pass
+  /// which will base on non-ssa machine instruction mode.
+  extern char &MachineModuloSchedID;
+
   /// This pass frees the memory occupied by the MachineFunction.
   FunctionPass *createFreeMachineFunctionPass();
 
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -259,6 +259,7 @@
 void initializeMachineModuleInfoPass(PassRegistry&);
 void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry&);
 void initializeMachineOutlinerPass(PassRegistry&);
+void initializeMachineModuloSchedPass(PassRegistry&);
 void initializeMachinePipelinerPass(PassRegistry&);
 void initializeMachinePostDominatorTreePass(PassRegistry&);
 void initializeMachineRegionInfoPassPass(PassRegistry&);
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -81,6 +81,7 @@
   MachineLoopInfo.cpp
   MachineModuleInfo.cpp
   MachineModuleInfoImpls.cpp
+  MachineModuloSched.cpp
   MachineOperand.cpp
   MachineOptimizationRemarkEmitter.cpp
   MachineOutliner.cpp
Index: lib/CodeGen/CodeGen.cpp
===================================================================
--- lib/CodeGen/CodeGen.cpp
+++ lib/CodeGen/CodeGen.cpp
@@ -64,6 +64,7 @@
   initializeMachineLICMPass(Registry);
   initializeMachineLoopInfoPass(Registry);
   initializeMachineModuleInfoPass(Registry);
+  initializeMachineModuloSchedPass(Registry);
   initializeMachineOptimizationRemarkEmitterPassPass(Registry);
   initializeMachineOutlinerPass(Registry);
   initializeMachinePipelinerPass(Registry);
Index: lib/CodeGen/MachineModuloSched.cpp
===================================================================
--- /dev/null
+++ lib/CodeGen/MachineModuloSched.cpp
@@ -0,0 +1,955 @@
+//===--------MachineModuloSched.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Build the data dependence graph(DDG) for a loop. This allows software
+// pipeliner(SWP) or other schedulers using the ddg info to make the scheduling
+// codes better.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuloSched.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/GraphWriter.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "modulo-sched"
+
+/// A command line option to turn software pipelining on or off.
+static cl::opt<bool> EnableSWP("enable-modulo-sched", cl::Hidden,
+                               cl::init(true), cl::ZeroOrMore,
+                               cl::desc("Enable Modulo Software Pipelining"));
+
+/// A command line option to enable SWP at -Os/-Oz.
+static cl::opt<bool> EnableSWPOptSize("enable-swp-opt-size",
+                                      cl::desc("Enable SWP at Os."), cl::Hidden,
+                                      cl::init(false));
+
+/// A command line argument to limit the number of stages in the pipeline.
+static cl::opt<int>
+    SWPMaxStages("swp-max-stages",
+                 cl::desc("Maximum stages allowed in the generated scheduled."),
+                 cl::Hidden, cl::init(3));
+
+char MachineModuloSched::ID = 0;
+char &llvm::MachineModuloSchedID = MachineModuloSched::ID;
+INITIALIZE_PASS_BEGIN(MachineModuloSched, "modulo-sched",
+                      "Modulo Software Pipelining", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(MachineModuloSched, "modulo-sched",
+                    "Modulo Software Pipelining", false, false)
+
+/// The "main" function for implementing machine modulo scheduling.
+bool MachineModuloSched::runOnMachineFunction(MachineFunction &mf) {
+  if (skipFunction(mf.getFunction()))
+    return false;
+
+  if (!EnableSWP)
+    return false;
+
+  if (mf.getFunction().getAttributes().hasAttribute(
+          AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
+      !EnableSWPOptSize.getPosition())
+    return false;
+
+  MF = &mf;
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  TII = MF->getSubtarget().getInstrInfo();
+  RegClassInfo.runOnMachineFunction(*MF);
+
+  for (auto &L : *MLI)
+    scheduleLoop(*L);
+
+  return false;
+}
+
+/// Attempt to perform the SWP algorithm on the specified loop. This function is
+/// the main entry point for the algorithm.  The function identifies candidate
+/// loops, calculates the minimum initiation interval, and attempts to schedule
+/// the loop.
+bool MachineModuloSched::scheduleLoop(MachineLoop &L) {
+  bool Changed = false;
+  for (auto &InnerLoop : L)
+    Changed |= scheduleLoop(*InnerLoop);
+
+  if (!canPipelineLoop(L))
+    return Changed;
+
+  Changed = swpModuloScheduler(L);
+
+  return Changed;
+}
+
+/// Return true if the loop can be software pipelined.  The algorithm is
+/// restricted to loops with a single basic block.  Make sure that the
+/// branch in the loop can be analyzed.
+bool MachineModuloSched::canPipelineLoop(MachineLoop &L) {
+  if (L.getNumBlocks() != 1)
+    return false;
+
+  // Check if the branch can't be understood because we can't do pipelining
+  // if that's the case.
+  LI.TBB = nullptr;
+  LI.FBB = nullptr;
+  LI.BrCond.clear();
+  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
+    return false;
+
+  LI.LoopInductionVar = nullptr;
+  LI.LoopCompare = nullptr;
+  if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
+    return false;
+
+  if (!L.getLoopPreheader())
+    return false;
+
+  // If any of the Phis contain subregs, then we can't pipeline
+  // because we don't know how to maintain subreg information in the
+  // VMap structure.
+  MachineBasicBlock *MBB = L.getHeader();
+  for (MachineBasicBlock::iterator BBI = MBB->instr_begin(),
+                                   BBE = MBB->getFirstNonPHI();
+       BBI != BBE; ++BBI)
+    for (unsigned i = 1; i != BBI->getNumOperands(); i += 2)
+      if (BBI->getOperand(i).getSubReg() != 0)
+        return false;
+
+  return true;
+}
+
+/// The SWP algorithm consists of the following main steps:
+/// 1. Computation and analysis of the dependence graph.
+/// 2. Ordering of the nodes (instructions).
+/// 3. Attempt to Schedule the loop.
+bool MachineModuloSched::swpModuloScheduler(MachineLoop &L) {
+  assert(L.getBlocks().size() == 1 && "SWP works on single blocks only.");
+
+  LLVM_DEBUG(dbgs() << "Attempt SWP on BB#" << L.getBlocks()[0]->getNumber()
+                    << "\n");
+
+  ModuloSchedulerDAG SWP(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo);
+
+  MachineBasicBlock *MBB = L.getHeader();
+  // The kernel should not include any terminator instructions.  These
+  // will be added back later.
+  SWP.startBlock(MBB);
+
+  // Compute the number of 'real' instructions in the basic block by
+  // ignoring terminators.
+  unsigned size = MBB->size();
+  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
+                                   E = MBB->instr_end();
+       I != E; ++I, --size)
+    ;
+
+  SWP.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
+  SWP.schedule();
+  SWP.exitRegion();
+
+  SWP.finishBlock();
+  return SWP.hasNewSchedule();
+}
+
+void ModuloSchedulerDAG::postprocessDAG() {
+  for (auto &M : Mutations)
+    M->apply(this);
+}
+
+/// Override the schedule function in ScheduleDAGInstrs to implement the
+/// scheduling part of the Modulo Scheduling algorithm.
+/// Create a data dependence graph(DDG) for the loop. There are two kinds of
+/// dependence, inner loop deps and loop carried deps. Modulo scheduler will
+/// call buildSchedGraph or buildDAGWithRegPressure to build inner loop deps.
+/// And call addLoopCarriedDependences to build loop carried deps. Finally, we
+/// can combine the existing DAG diagram, and form a complete data dependency
+/// graph(DDG).
+void ModuloSchedulerDAG::schedule() {
+  // Build inner loop dependence for modulo scheduler.
+  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+  buildSchedGraph(AA);
+  postprocessDAG();
+  LLVM_DEBUG(dump());
+
+  // Add loop carried dependence for modulo scheduler.
+  if (!addLoopCarriedDependences())
+    return;
+
+  // build the strongly connected component graph(SCC) for modulo scheduler.
+  buildSccs();
+
+  // TODO: SWP algorithm will be implemented here based on the SCCs result.
+}
+
+/// Add loop carried dependence for registers and memories, and record all the
+/// BackEdges.
+bool ModuloSchedulerDAG::addLoopCarriedDependences() {
+  // There is nothing to do for this BB.
+  if (SUnits.size() <= 1)
+    return false;
+
+  // We'll be allocating one DdgUnit for each real instruction in the region,
+  // which is contained within a basic block.
+  DdgUnits.reserve(SUnits.size());
+
+  // Map the SUnit to DdgUnit.
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    assert(i == SU->NodeNum && i == DdgUnits.size() && "Wrong node number.");
+    DdgUnit *DU = newDdgUnit(SU);
+    DdgUnitMap[SU] = DU;
+  }
+
+  // Add the loop carried dependences for memory and registers.
+  addLoopCarriedMemDeps();
+  addLoopCarriedRegDeps();
+
+  // Find all the backedges, push them into BackEdges vector.
+  for (auto &DU : DdgUnits) {
+    for (unsigned i = 0, e = DU.Succs.size(); i != e; ++i) {
+      SDep *Dep = &DU.Succs[i];
+      DdgUnit *DestDU = getDdgUnit(Dep->getSUnit());
+      if (isBackEdge(&DU, DestDU))
+        BackEdges.push_back(std::make_pair(Dep, &DU));
+    }
+  }
+  assert(BackEdges.size() == NumBackEdges && "Wrong number of backedges.");
+
+  return true;
+}
+
+/// Return the underlying objects for the memory references of an instruction.
+/// This function calls the code in ValueTracking, but first checks that the
+/// instruction has a memory operand.
+static void getUnderlyingObjects(MachineInstr *MI,
+                                 SmallVectorImpl<Value *> &Objs,
+                                 const DataLayout &DL) {
+  if (!MI->hasOneMemOperand())
+    return;
+  MachineMemOperand *MM = *MI->memoperands_begin();
+  if (!MM->getValue())
+    return;
+  GetUnderlyingObjects(const_cast<Value *>(MM->getValue()), Objs, DL);
+}
+
+/// Return true if the instruction causes a chain between memory
+/// references before and after it.
+static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
+  return MI.isCall() || MI.hasUnmodeledSideEffects() ||
+         (MI.hasOrderedMemoryRef() &&
+          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
+}
+
+MachineInstr *ModuloSchedulerDAG::getSingleDef(unsigned Reg,
+                                               const MachineRegisterInfo *MRI) {
+  MachineInstr *Ret = nullptr;
+  for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+    if (DefMI.getParent() != BB || DefMI.isDebugValue())
+      continue;
+    if (!Ret)
+      Ret = &DefMI;
+    else if (Ret != &DefMI)
+      return nullptr;
+  }
+  return Ret;
+}
+
+/// Return true if we can compute the amount the instruction changes
+/// during each iteration. Set Delta to the amount of the change.
+bool ModuloSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI =
+      BB->getParent()->getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  MachineOperand *BaseOp;
+  int64_t Offset;
+  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+    return false;
+
+  if (!BaseOp->isReg())
+    return false;
+
+  unsigned BaseReg = BaseOp->getReg();
+
+  MachineInstr *BaseDef = getSingleDef(BaseReg, &MRI);
+  if (!BaseDef)
+    return false;
+
+  int D = 0;
+  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
+    return false;
+
+  Delta = D;
+  return true;
+}
+
+/// There are three kinds of loop carried memory dependences:
+/// 1) Store-->Load true deps
+///   BB:
+///     ...
+///     Load     xxx           (a) <--
+///     ...                           |
+///     ...                           |  (True Dep)
+///     Store    xxx           (b) ---
+///     ...
+///     Branch  BB
+/// 2) Store-->Store output deps
+///   BB:
+///     ...
+///     Store    xxx           (a) <--
+///     ...                           |
+///     ...                           |  (Output Dep)
+///     Store    xxx           (b) ---
+///     ...
+///     Branch  BB
+/// 3) Load-->Store Anti deps
+///   BB:
+///     ...
+///     Store    xxx           (a) <--
+///     ...                           |
+///     ...                           |  (Anti Dep)
+///     Load     xxx           (b) ---
+///     ...
+///     Branch  BB
+void ModuloSchedulerDAG::addLoopCarriedMemDeps() {
+  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI =
+      BB->getParent()->getSubtarget().getRegisterInfo();
+  MapVector<Value *, SmallVector<SUnit *, 8>> PendingLoadsStores;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnit *SUI = &SUnits[i];
+    MachineInstr *DstInst = SUI->getInstr();
+    if (isDependenceBarrier(*DstInst, AA)) {
+      PendingLoadsStores.clear();
+      // For Barrier dependence, we must add loop carried dependence for them.
+      for (auto &PI : SUI->Preds) {
+        if (PI.getKind() != SDep::Order)
+          continue;
+
+        SUnit *SUJ = PI.getSUnit();
+        MachineInstr *SrcInst = SUJ->getInstr();
+        createLoopMemDep(SUI, SUJ,
+                         (DstInst->mayStore() && SrcInst->mayLoad()) ? 1 : 0);
+      }
+
+      for (auto &SI : SUI->Succs) {
+        if (SI.getKind() != SDep::Order)
+          continue;
+
+        SUnit *SUJ = SI.getSUnit();
+        MachineInstr *SrcInst = SUJ->getInstr();
+        createLoopMemDep(SUJ, SUI,
+                         (SrcInst->mayStore() && DstInst->mayLoad()) ? 1 : 0);
+      }
+    }
+    if (!DstInst->mayLoadOrStore())
+      continue;
+
+    SmallVector<Value *, 8> Objs;
+    getUnderlyingObjects(DstInst, Objs, BB->getParent()->getDataLayout());
+    for (auto V : Objs) {
+      SmallVector<SUnit *, 8> &SUs = PendingLoadsStores[V];
+      SUs.push_back(SUI);
+    }
+
+    for (unsigned j = 0; j <= i; j++) {
+      SUnit *SUJ = &SUnits[j];
+      MachineInstr *SrcInst = SUJ->getInstr();
+      if (!SrcInst->mayLoadOrStore() || isDependenceBarrier(*SrcInst, AA))
+        continue;
+
+      if (SUJ->isSucc(SUI)) {
+        for (auto &PI : SUI->Preds) {
+          if (PI.getSUnit() != SUJ)
+            continue;
+          if (PI.getKind() != SDep::Order)
+            continue;
+
+          // TODO: This approach is conservative, the function was similar with
+          // addLoopCarriedDependences in MachinePipeliner.cpp, we can do more
+          // accurate analysis for it.
+          if (SrcInst->mayLoadOrStore() && !SUI->isSucc(SUJ))
+            createLoopMemDep(SUI, SUJ, DstInst->mayLoad() ? 0 : 1);
+        }
+      } else {
+        for (auto V : Objs) {
+          MapVector<Value *, SmallVector<SUnit *, 8>>::iterator I =
+              PendingLoadsStores.find(V);
+          if (I == PendingLoadsStores.end())
+            continue;
+
+          bool NeedCrossDep = false;
+          for (auto SU : I->second) {
+            MachineInstr *MI = SU->getInstr();
+            if (MI != SrcInst)
+              continue;
+            if (DstInst->mayLoad() && MI->mayLoad())
+              break;
+
+            // First, perform the cheaper check that compares the base reg.
+            MachineOperand *BaseReg1, *BaseReg2;
+            int64_t Offset1, Offset2;
+            bool Legal1 =
+                TII->getMemOperandWithOffset(*MI, BaseReg1, Offset1, TRI);
+            bool Legal2 =
+                TII->getMemOperandWithOffset(*DstInst, BaseReg2, Offset2, TRI);
+            if (Legal1 && Legal2 && BaseReg1->isIdenticalTo(*BaseReg2)) {
+              unsigned Delta1, Delta2;
+              unsigned MaxLoopCarry = (SWPMaxStages + 1);
+              if (computeDelta(*MI, Delta1) && computeDelta(*DstInst, Delta2)) {
+                uint64_t Width1 = (*MI->memoperands_begin())->getSize();
+                uint64_t Width2 = (*DstInst->memoperands_begin())->getSize();
+                // This is the main test, which checks the offset values and the
+                // loop increment value to determine if the accesses may be loop
+                // carried.
+                if ((Delta1 > 0 &&
+                     std::abs(Offset2 - Offset1) >= MaxLoopCarry * Delta1) ||
+                    ((Offset1 >= Offset2) && (Offset1 + Width1 <= Delta1)) ||
+                    ((Offset1 < Offset2) && (Offset2 + Width2 <= Delta2)))
+                  break;
+              }
+            }
+
+            // Second, the more expensive check that uses alias analysis on the
+            // value with MIMO info.
+            if (!AA) {
+              NeedCrossDep = true;
+              break;
+            }
+
+            MachineMemOperand *MMO1 = *MI->memoperands_begin();
+            MachineMemOperand *MMO2 = *DstInst->memoperands_begin();
+            if (!MMO1->getValue() || !MMO2->getValue()) {
+              NeedCrossDep = true;
+              break;
+            }
+            AliasResult AAResult = AA->alias(
+                MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+                               MMO1->getAAInfo()),
+                MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+                               MMO2->getAAInfo()));
+            if (AAResult != NoAlias)
+              NeedCrossDep = true;
+            break;
+          }
+
+          if (NeedCrossDep)
+            createLoopMemDep(SUI, SUJ, DstInst->mayLoad() ? 0 : 1);
+        }
+      }
+    }
+  }
+}
+
+void ModuloSchedulerDAG::createLoopMemDep(SUnit *Src, SUnit *Dst, int Latency) {
+  if (Src == Dst)
+    return;
+  SDep Dep(Src, SDep::MayAliasMem);
+  Dep.setLatency(Latency);
+
+  DdgUnit *DUse = DdgUnitMap[Dst];
+  DdgUnit *DDef = DdgUnitMap[Src];
+  if (DUse->addPred(Dep, DDef))
+    NumBackEdges++;
+}
+
+void ModuloSchedulerDAG::addLoopCarriedRegDeps() {
+  const TargetRegisterInfo *TRI =
+      BB->getParent()->getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  BitVector PRegDefs(TRI->getNumRegs());
+  BitVector VRegDefs(MRI.getNumVirtRegs());
+
+  for (auto &N : SUnits) {
+    MachineInstr *MI = N.getInstr();
+
+    for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        if (PRegDefs[Reg])
+          continue;
+        addLoopCarriedRegDep(MRI, Reg);
+        PRegDefs.set(Reg);
+      } else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        unsigned u = TargetRegisterInfo::virtReg2Index(Reg);
+        if (VRegDefs[u])
+          continue;
+        addLoopCarriedRegDep(MRI, Reg);
+        VRegDefs.set(u);
+      }
+    }
+  }
+}
+
+int ModuloSchedulerDAG::getLastDefOperand(MachineRegisterInfo &MRI,
+                                          unsigned Reg, unsigned &Min,
+                                          unsigned &Max) {
+  int LastOpIdx = -1;
+  for (MachineOperand &MO : MRI.def_operands(Reg)) {
+    MachineInstr *MI = MO.getParent();
+    SUnit *SU = MISUnitMap[MI];
+    if (!SU)
+      continue;
+
+    if (SU->NodeNum <= Min)
+      Min = SU->NodeNum;
+    if (SU->NodeNum >= Max) {
+      Max = SU->NodeNum;
+      LastOpIdx = &MO - &MI->getOperand(0);
+    }
+  }
+
+  return LastOpIdx;
+}
+
+/// Add loop carried dependence for registers, include physical or virtual
+/// registers, with three kinds of deps "True/Anti/Output".
+/// The following is the detailed description of the three kinds of loop carried
+/// registers dependences:
+/// 1) <last def>--><use> true deps
+///   BB:
+///     ...
+///     yyy = Reg<use>                (a) <--
+///     ...                                  |
+///     Reg = xxx<first def>                 |
+///     ...                                  |  (True Dep)
+///     Reg = zzz<last def>           (b) ---
+///     ...
+///     Branch  BB
+/// 2) <use>--><first def> anti deps
+///   BB:
+///     ...
+///     Reg = xxx<first def>          (a) <--
+///     ...                                  |
+///     Reg = zzz<last def>                  |
+///     ...                                  |  (Anti Dep)
+///     yyy = Reg<use>                (b) ---
+///     ...
+///     Branch  BB
+/// 3) <last def>--><first def> output deps
+///   BB:
+///     ...
+///     Reg = xxx<first def>          (a) <--
+///     ...                                  |
+///     ...                                  |  (Output Dep)
+///     Reg = zzz<last def>           (b) ---
+///     ...
+///     Branch  BB
+void ModuloSchedulerDAG::addLoopCarriedRegDep(MachineRegisterInfo &MRI,
+                                              unsigned Reg) {
+  unsigned Min = SUnits.size();
+  unsigned Max = 0;
+  int LastOpIdx = -1;
+  if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+    for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
+      LastOpIdx = getLastDefOperand(MRI, *Alias, Min, Max);
+    }
+  } else
+    LastOpIdx = getLastDefOperand(MRI, Reg, Min, Max);
+
+  assert(Min <= Max && "Can not find Min/Max DDG Node for inter-loop.");
+
+  MachineInstr *LastDefMI = SUnits[Max].getInstr();
+  MachineInstr *FirstDefMI = SUnits[Min].getInstr();
+
+  bool UseByLastDef = false;
+  for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+    MachineInstr *UseMI = MO.getParent();
+    SUnit *SU = MISUnitMap[UseMI];
+    if (!SU)
+      continue;
+
+    if (SU->NodeNum <= Min) {
+      // First, add true register dependence for last def to the use in the next
+      // iteration.
+      unsigned OpNo = &MO - &UseMI->getOperand(0);
+      SDep Dep = SDep(&SUnits[Max], SDep::Data, Reg);
+      int Latency =
+          SchedModel.computeOperandLatency(LastDefMI, LastOpIdx, UseMI, OpNo);
+      Dep.setLatency(Latency);
+      createLoopRegDep(SU, Dep);
+      UseByLastDef = true;
+    } else if (SU->NodeNum > Max) {
+      // Second, add anti register dependence for the use after last def to the
+      // first register def in the next iteration.
+      SDep Dep = SDep(SU, SDep::Anti, Reg);
+      createLoopRegDep(&SUnits[Min], Dep);
+      UseByLastDef = true;
+    }
+  }
+
+  // Third, add an output register dependence between last def and the first def
+  // in the next iteration.
+  // There are two cases need be avoided, we need check:
+  // a) Whether a self output dependence(Min == Max).
+  // b) Is there a register use between last def and first def in the next
+  //    iteration.
+  if (!UseByLastDef && Min != Max) {
+    SDep Dep = SDep(&SUnits[Max], SDep::Output, Reg);
+    Dep.setLatency(
+        SchedModel.computeOutputLatency(LastDefMI, LastOpIdx, FirstDefMI));
+    createLoopRegDep(&SUnits[Min], Dep);
+  }
+}
+
+void ModuloSchedulerDAG::createLoopRegDep(SUnit *Use, SDep &Dep) {
+  DdgUnit *DUse = DdgUnitMap[Use];
+  DdgUnit *DDef = DdgUnitMap[Dep.getSUnit()];
+  if (DUse->addPred(Dep, DDef))
+    NumBackEdges++;
+}
+
+/// newDdgUnit - Creates a new DdgUnit and return a ptr to it.
+ModuloSchedulerDAG::DdgUnit *ModuloSchedulerDAG::newDdgUnit(SUnit *SU) {
+#ifndef NDEBUG
+  const DdgUnit *Addr = DdgUnits.empty() ? nullptr : &DdgUnits[0];
+#endif
+  DdgUnits.emplace_back(this, SU);
+  assert((Addr == nullptr || Addr == &DdgUnits[0]) &&
+         "DdgUnits std::vector reallocated on the fly!");
+  return &DdgUnits.back();
+}
+
+/// addPred - This adds the specified edge as a pred of the current node if
+/// not already.  It also adds the current node as a successor of the
+/// specified node.
+bool ModuloSchedulerDAG::DdgUnit::addPred(const SDep &D, DdgUnit *DDef) {
+  // If this node already has this dependence, don't add a redundant one.
+  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+    if (I->overlaps(D)) {
+      // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
+      if (I->getLatency() < D.getLatency()) {
+        // Find the corresponding successor in N.
+        SDep ForwardD = *I;
+        ForwardD.setSUnit(this->SU);
+        for (SmallVectorImpl<SDep>::iterator II = DDef->Succs.begin(),
+                                             EE = DDef->Succs.end();
+             II != EE; ++II) {
+          if (*II == ForwardD) {
+            II->setLatency(D.getLatency());
+            break;
+          }
+        }
+        I->setLatency(D.getLatency());
+      }
+      return false;
+    }
+  }
+  // Now add a corresponding succ to N.
+  SDep P = D;
+  P.setSUnit(this->SU);
+
+  Preds.push_back(D);
+  DDef->Succs.push_back(P);
+
+  return true;
+}
+
+ModuloSchedulerDAG::DdgUnit *ModuloSchedulerDAG::getDdgUnit(SUnit *SU) {
+  if (DdgUnitMap.find(SU) == DdgUnitMap.end())
+    return NULL;
+
+  return DdgUnitMap[SU];
+}
+
+void ModuloSchedulerDAG::buildSccs() {
+  unsigned NumNodes = SUnits.size();
+  for (unsigned i = 0; i < NumBackEdges; i++) {
+    SDep *Dep = BackEdges[i].first;
+    DdgUnit *Src = BackEdges[i].second;
+    DdgUnit *Dest = getDdgUnit(Dep->getSUnit());
+
+    if (InSccs.find(Dep) != InSccs.end())
+      continue;
+
+    BitVector Start(NumNodes);
+    BitVector End(NumNodes);
+
+    // Kosaraju's algorithm to find the SCC graph.
+    findNodesByOrder(Dest->NodeNum, Start, true);
+    findNodesByOrder(Src->NodeNum, End, false);
+    Start &= End;
+
+    if (Start.any()) {
+      Scc SCC(this, NumNodes);
+      SCC.creatScc(Start);
+      Sccs.push_back(SCC);
+      NumSccs++;
+    }
+  }
+
+  if (NumSccs == 0)
+    return;
+
+#ifndef NDEBUG
+  // Check that every node in SCCS belongs to exactly one strongly connected
+  // component and that no element of SCCS is empty.
+  BitVector Tmp(NumNodes);
+  for (int i = 0; i < NumSccs; i++) {
+    assert(Sccs[i].Nodes.any() && "Scc can not be empty.");
+    // Verify that every node in sccs is
+    // in exactly one strongly connected component.
+    assert(!Tmp.anyCommon(Sccs[i].Nodes) && "Sccs can not have common nodes.");
+    Tmp |= Sccs[i].Nodes;
+  }
+  LLVM_DEBUG(dbgs() << "Node numbers for all sccs is " << Tmp.count()
+                    << ", Total Node numbers is " << NumNodes << ".\n");
+#endif
+}
+
+void ModuloSchedulerDAG::findNodesByOrder(unsigned NodeNum, BitVector &Result,
+                                          bool Succs) {
+  unsigned NumNodes = SUnits.size();
+  BitVector Worklist(NumNodes);
+  BitVector Visited(NumNodes);
+
+  Result.set(NodeNum);
+  Visited.set(NodeNum);
+
+  bool change = true;
+  while (change) {
+    change = false;
+    Worklist.reset();
+    Worklist |= Visited;
+    Visited.reset();
+
+    for (int u = Worklist.find_first(); u >= 0; u = Worklist.find_next(u)) {
+      DdgUnit &DU = DdgUnits[u];
+      for (unsigned i = 0, e = Succs ? DU.Succs.size() : DU.Preds.size();
+           i != e; ++i) {
+        SDep *Dep = Succs ? &DU.Succs[i] : &DU.Preds[i];
+        DdgUnit *DestDU = getDdgUnit(Dep->getSUnit());
+        assert(DestDU != NULL && "Can't find pred/succ DDG node.");
+        int V = DestDU->NodeNum;
+
+        if (!Result[V]) {
+          Result.set(V);
+          Visited.set(V);
+          change = true;
+        }
+      }
+    }
+  }
+}
+
+void ModuloSchedulerDAG::Scc::creatScc(BitVector &SccNodes) {
+  Nodes.clear();
+  Nodes.resize(NumNodes);
+
+  Nodes |= SccNodes;
+
+  for (int u = SccNodes.find_first(); u >= 0; u = SccNodes.find_next(u)) {
+    DdgUnit &DU = DAG->DdgUnits[u];
+
+    for (unsigned i = 0, e = DU.Succs.size(); i != e; ++i) {
+      SDep *Dep = &DU.Succs[i];
+      DdgUnit *DestDU = DAG->getDdgUnit(Dep->getSUnit());
+
+      if (SccNodes[DestDU->NodeNum]) {
+        DAG->InSccs.insert(Dep);
+        if (DAG->isBackEdge(&DU, DestDU)) {
+          BackEdges.push_back(std::make_pair(Dep, &DU));
+          NumBackEdges++;
+        }
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// GraphWriter support for Ddg.
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace llvm {
+class DdgNodeIterator
+    : public std::iterator<std::forward_iterator_tag,
+                           ModuloSchedulerDAG::DdgUnit, ptrdiff_t> {
+  ModuloSchedulerDAG::DdgUnit *Node;
+  unsigned Operand;
+
+  DdgNodeIterator(ModuloSchedulerDAG::DdgUnit *N, unsigned Op)
+      : Node(N), Operand(Op) {}
+
+public:
+  bool operator==(const DdgNodeIterator &x) const {
+    return Operand == x.Operand;
+  }
+  bool operator!=(const DdgNodeIterator &x) const { return !operator==(x); }
+
+  pointer operator*() const {
+    SUnit *SU = getDdgEdge()->getSUnit();
+    return Node->DAG->getDdgUnit(SU);
+  }
+  pointer operator->() const { return operator*(); }
+
+  // Preincrement
+  DdgNodeIterator &operator++() {
+    ++Operand;
+    return *this;
+  }
+  // Postincrement
+  DdgNodeIterator operator++(int) {
+    DdgNodeIterator tmp = *this;
+    ++*this;
+    return tmp;
+  }
+
+  static DdgNodeIterator begin(ModuloSchedulerDAG::DdgUnit *N) {
+    return DdgNodeIterator(N, 0);
+  }
+  static DdgNodeIterator end(ModuloSchedulerDAG::DdgUnit *N) {
+    return DdgNodeIterator(N, N->Succs.size());
+  }
+
+  unsigned getOperand() const { return Operand; }
+  const ModuloSchedulerDAG::DdgUnit *getNode() const { return Node; }
+
+  bool isMemDep() const { return getDdgEdge()->isNormalMemoryOrBarrier(); }
+  SDep::Kind getDepType() const { return getDdgEdge()->getKind(); }
+  const SDep *getDdgEdge() const { return &Node->Succs[Operand]; }
+};
+
+template <> struct GraphTraits<ModuloSchedulerDAG::DdgUnit *> {
+  typedef ModuloSchedulerDAG::DdgUnit *NodeRef;
+  typedef DdgNodeIterator ChildIteratorType;
+
+  static NodeRef getEntryNode(NodeRef A) { return A; }
+  static ChildIteratorType child_begin(NodeRef N) {
+    return DdgNodeIterator::begin(N);
+  }
+  static ChildIteratorType child_end(NodeRef N) {
+    return DdgNodeIterator::end(N);
+  }
+};
+
+template <>
+struct GraphTraits<ModuloSchedulerDAG *>
+    : public GraphTraits<ModuloSchedulerDAG::DdgUnit *> {
+  typedef pointer_iterator<std::vector<ModuloSchedulerDAG::DdgUnit>::iterator>
+      nodes_iterator;
+  static nodes_iterator nodes_begin(ModuloSchedulerDAG *G) {
+    return nodes_iterator(G->DdgUnits.begin());
+  }
+  static nodes_iterator nodes_end(ModuloSchedulerDAG *G) {
+    return nodes_iterator(G->DdgUnits.end());
+  }
+};
+
+template <>
+struct DOTGraphTraits<ModuloSchedulerDAG *> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const ModuloSchedulerDAG *G) {
+    return G->MF.getName();
+  }
+
+  /// If you want to override the dot attributes printed for a particular
+  /// edge, override this method.
+  static std::string getEdgeAttributes(const ModuloSchedulerDAG::DdgUnit *Node,
+                                       DdgNodeIterator EI,
+                                       const ModuloSchedulerDAG *Graph) {
+    std::string Str;
+    raw_string_ostream OS(Str);
+
+    SUnit *SU = EI.getDdgEdge()->getSUnit();
+    if (SU->NodeNum <= Node->SU->NodeNum)
+      OS << "style=dashed, ";
+
+    if (EI.getDepType() == SDep::Data)
+      OS << "color = black, label = \"true " << EI.getDdgEdge()->getLatency();
+    else if (EI.getDepType() == SDep::Anti)
+      OS << "color = red, label = \"anti " << EI.getDdgEdge()->getLatency();
+    else if (EI.getDepType() == SDep::Output)
+      OS << "color = blue, label = \"output " << EI.getDdgEdge()->getLatency();
+    else {
+      OS << "color = cyan, label = \"mem " << EI.getDdgEdge()->getLatency();
+    }
+
+    if (EI.isMemDep())
+      OS << " M\"";
+    else
+      OS << " R\"";
+    return OS.str();
+  }
+
+  static std::string getNodeLabel(const ModuloSchedulerDAG::DdgUnit *Node,
+                                  const ModuloSchedulerDAG *G) {
+    std::string Str;
+    raw_string_ostream OS(Str);
+    MachineBasicBlock *BB = Node->SU->getInstr()->getParent();
+
+    if (Node->SU != nullptr)
+      OS << "N:" << Node->SU->NodeNum;
+    else
+      OS << "BB#" << BB->getNumber();
+
+    return OS.str();
+  }
+  static std::string getNodeDescription(const ModuloSchedulerDAG::DdgUnit *N,
+                                        const ModuloSchedulerDAG *G) {
+    std::string Str;
+    raw_string_ostream OS(Str);
+
+    if (N->SU != nullptr)
+      N->SU->getInstr()->print(OS, /*SkipOpers=*/true);
+    else
+      OS << "Total insn: " << G->DdgUnits.size();
+
+    return OS.str();
+  }
+
+  static std::string getNodeAttributes(const ModuloSchedulerDAG::DdgUnit *N,
+                                       const ModuloSchedulerDAG *G) {
+    if (N->SU != nullptr)
+      return "shape=Mrecord";
+    else
+      return "style = filled";
+  }
+};
+} // namespace llvm
+#endif // NDEBUG
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ModuloSchedulerDAG::viewGraph(const Twine &Name, const Twine &Title) {
+#ifndef NDEBUG
+  ViewGraph(this, Name, false, Title);
+#else
+  errs() << "Ddg::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ModuloSchedulerDAG::viewGraph() {
+  viewGraph(BB->getFullName(), "DDG Graph for " + BB->getName());
+}