Index: lib/CodeGen/MachinePipeliner.cpp
===================================================================
--- lib/CodeGen/MachinePipeliner.cpp
+++ lib/CodeGen/MachinePipeliner.cpp
@@ -119,6 +119,8 @@
 #include <utility>
 #include <vector>
 
+/* #define USE_DFAPacketizer_P 1 */
+
 using namespace llvm;
 
 #define DEBUG_TYPE "pipeliner"
@@ -601,12 +603,18 @@
   /// Virtual register information.
   MachineRegisterInfo &MRI;
 
+#ifdef USE_DFAPacketizer_P
   std::unique_ptr<DFAPacketizer> Resources;
-
+#endif /* USE_DFAPacketizer_P */
 public:
+#ifdef USE_DFAPacketizer_P
   SMSchedule(MachineFunction *mf)
       : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
         Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
+#else
+  SMSchedule(MachineFunction *mf)
+      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()) {}
+#endif /* USE_DFAPacketizer_P */
 
   void reset() {
     ScheduledInstrs.clear();
@@ -1309,6 +1317,7 @@
 
 } // end anonymous namespace
 
+#ifdef USE_DFAPacketizer_P
 /// Calculate the resource constrained minimum initiation interval for the
 /// specified loop. We use the DFA to model the resources needed for
 /// each instruction, and we ignore dependences. A different DFA is created
@@ -1377,6 +1386,19 @@
   Resources.clear();
   return Resmii;
 }
+#else
+unsigned SwingSchedulerDAG::calculateResMII() {
+  // Consider only the issue width.
+  MachineBasicBlock *MBB = Loop.getHeader();
+  unsigned size = 0;
+  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
+                                   E = MBB->getFirstTerminator();
+       I != E; ++I) {
+    size++;
+  }
+  return (size + SchedModel.getIssueWidth() - 1) / SchedModel.getIssueWidth();
+}
+#endif /* USE_DFAPacketizer_P */
 
 /// Calculate the recurrence-constrainted minimum initiation interval.
 /// Iterate over each circuit. Compute the delay(c) and distance(c)
@@ -3483,6 +3505,7 @@
     M->apply(this);
 }
 
+#ifdef USE_DFAPacketizer_P
 /// Try to schedule the node at the specified StartCycle and continue
 /// until the node is schedule or the EndCycle is reached. This function
 /// returns true if the node is scheduled. This routine may search either
@@ -3536,6 +3559,131 @@
   }
   return false;
 }
+#else
+static void clearResources(std::vector<unsigned> &ResourceTable) {
+  for (unsigned Idx = 0; Idx < ResourceTable.size(); ++Idx) {
+    ResourceTable[Idx] = 0;
+  }
+}
+
+static bool canReserveResources(const TargetSubtargetInfo *STI,
+                                const MCSchedModel &SchedModel,
+                                std::vector<unsigned> &ResourceTable,
+                                SUnit *SU) {
+  unsigned SchedClass = SU->getInstr()->getDesc().getSchedClass();
+  const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(SchedClass);
+  unsigned Issues = 0;
+  for (unsigned Idx = 0; Idx < ResourceTable.size(); ++Idx) {
+    Issues += ResourceTable[Idx];
+  }
+  if (Issues >= SchedModel.IssueWidth)
+    return false;
+
+  // TODO: This check is not accurate. It needs information about the
+  // sub-units of each ProcResGroup, as proposed in
+  // "[TargetSchedule] Expose sub-units of a ProcResGroup in MCProcResourceDesc"
+  // (https://reviews.llvm.org/D43023), and it should also track resource
+  // usage with ResourceCycles.
+  bool Check = false;
+  for (const MCWriteProcResEntry &PRE :
+       make_range(STI->getWriteProcResBegin(SC),
+                  STI->getWriteProcResEnd(SC))) {
+    unsigned Idx = PRE.ProcResourceIdx;
+    if (SchedModel.getProcResource(Idx)->NumUnits > ResourceTable[Idx]) {
+      return true;
+    }
+    Check = true;
+  }
+  if (!Check) {
+    ResourceTable[0]++;
+    return true;
+  }
+  return false;
+}
+
+static void reserveResources(const TargetSubtargetInfo *STI,
+                             const MCSchedModel &SchedModel,
+                             std::vector<unsigned> &ResourceTable,
+                             SUnit *SU) {
+  unsigned SchedClass = SU->getInstr()->getDesc().getSchedClass();
+  const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(SchedClass);
+  bool Check = false;
+  for (const MCWriteProcResEntry &PRE :
+       make_range(STI->getWriteProcResBegin(SC),
+                  STI->getWriteProcResEnd(SC))) {
+    unsigned Idx = PRE.ProcResourceIdx;
+    if (SchedModel.getProcResource(Idx)->NumUnits > ResourceTable[Idx]) {
+      ResourceTable[Idx]++;
+      return;
+    }
+    Check = true;
+  }
+  if (!Check) {
+    ResourceTable[0]++;
+    return;
+  }
+  llvm_unreachable("reserveResources");
+}
+
+bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
+  const TargetSubtargetInfo *STI = &ST;
+  std::vector<unsigned> ResourceTable;
+  const MCSchedModel &SchedModel = ST.getSchedModel();
+  {
+    unsigned NumRes = SchedModel.getNumProcResourceKinds();
+    for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+      ResourceTable.push_back(0);
+    }
+  }
+  bool forward = true;
+  if (StartCycle > EndCycle)
+    forward = false;
+
+  // The terminating condition depends on the direction.
+  int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
+  for (int curCycle = StartCycle; curCycle != termCycle;
+       forward ? ++curCycle : --curCycle) {
+
+    // Add the already scheduled instructions at the specified cycle.
+    clearResources(ResourceTable);
+    for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
+         checkCycle <= LastCycle; checkCycle += II) {
+      std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
+
+      for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
+                                         E = cycleInstrs.end();
+           I != E; ++I) {
+        if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
+          continue;
+        assert(canReserveResources(STI, SchedModel, ResourceTable, (*I)) &&
+               "These instructions have already been scheduled.");
+        reserveResources(STI, SchedModel, ResourceTable, (*I));
+      }
+    }
+    if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
+        canReserveResources(STI, SchedModel, ResourceTable, SU)) {
+      DEBUG({
+        dbgs() << "\tinsert at cycle " << curCycle << " ";
+        SU->getInstr()->dump();
+      });
+
+      ScheduledInstrs[curCycle].push_back(SU);
+      InstrToCycle.insert(std::make_pair(SU, curCycle));
+      if (curCycle > LastCycle)
+        LastCycle = curCycle;
+      if (curCycle < FirstCycle)
+        FirstCycle = curCycle;
+      return true;
+    }
+    DEBUG({
+      dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
+      SU->getInstr()->dump();
+    });
+  }
+  return false;
+}
+#endif /* USE_DFAPacketizer_P */
 
 // Return the cycle of the earliest scheduled instruction in the chain.
 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -293,6 +293,16 @@
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                   MachineInstr *&CmpInst) const override;
+
+  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                           MachineInstr *IndVar, MachineInstr &Cmp,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                           unsigned Iter, unsigned MaxIter) const override;
+
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
   bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
@@ -309,6 +319,7 @@
   bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
                       unsigned &SrcReg2, int &CmpMask,
                       int &CmpValue) const override;
+
   /// optimizeCompareInstr - Convert the instruction supplying the argument to
   /// the comparison into one that sets the zero bit in the flags register.
   bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5063,3 +5063,57 @@
 
   return It;
 }
+
+bool AArch64InstrInfo::analyzeLoop(MachineLoop &L,
+                                   MachineInstr *&IndVarInst,
+                                   MachineInstr *&CmpInst) const {
+  MachineBasicBlock *LoopEnd = L.getBottomBlock();
+  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+  MachineBasicBlock::iterator E = LoopEnd->getFirstNonDebugInstr();
+  MachineInstr *BccMI = nullptr;
+  MachineInstr *CompMI = nullptr;
+  MachineInstr *CopyMI = nullptr;
+  MachineInstr *AddMI = nullptr;
+  for (; I != E; --I) {
+    if (!BccMI && I->getOpcode() == AArch64::Bcc) {
+      BccMI = &*I;
+      AArch64CC::CondCode CC =
+          (AArch64CC::CondCode)BccMI->getOperand(0).getImm();
+      if (CC != AArch64CC::LT)
+        return true;
+    } else if (BccMI && !CompMI && I->getOpcode() == AArch64::SUBSXrr) {
+      CompMI = &*I;
+    } else if (CompMI && !CopyMI && I->getOpcode() == AArch64::COPY) {
+      if (CompMI->getOperand(1).getReg() == I->getOperand(1).getReg()) {
+        CopyMI = &*I;
+      }
+    } else if (CopyMI && !AddMI && I->getOpcode() == AArch64::ADDXri) {
+      if (CompMI->getOperand(1).getReg() == I->getOperand(0).getReg()) {
+        AddMI = &*I;
+      }
+    } else if (AddMI && I->isPHI()) {
+      if (I->getOperand(0).getReg() == AddMI->getOperand(1).getReg() &&
+          I->getOperand(3).getReg() == CopyMI->getOperand(0).getReg()) {
+        IndVarInst = AddMI;
+        CmpInst = CompMI;
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+unsigned AArch64InstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+                                           MachineInstr *IndVar, MachineInstr &Cmp,
+                                           SmallVectorImpl<MachineOperand> &Cond,
+                                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                                           unsigned Iter, unsigned MaxIter) const {
+  MachineInstr *CompMI = nullptr;
+  for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I) {
+    if (I->getOpcode() == AArch64::SUBSXrr) {
+      CompMI = &*I;
+    }
+  }
+  unsigned LoopCount = CompMI->getOperand(1).getReg();
+  Cond.push_back(MachineOperand::CreateImm(AArch64CC::LT));
+  return LoopCount;
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -488,6 +488,8 @@
     // be register coaleascer friendly.
     addPass(&PeepholeOptimizerID);
   }
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(&MachinePipelinerID);
 }
 
 void AArch64PassConfig::addPostRegAlloc() {
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -77,12 +77,6 @@
 unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
                          bool HasMemoryOperand = false);
 
-// Turn jCC opcode into condition code.
-CondCode getCondFromBranchOpc(unsigned Opc);
-
-// Turn setCC opcode into condition code.
-CondCode getCondFromSETOpc(unsigned Opc);
-
 // Turn CMov opcode into condition code.
 CondCode getCondFromCMovOpc(unsigned Opc);
 
@@ -366,6 +360,16 @@
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                   MachineInstr *&CmpInst) const override;
+
+  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                           MachineInstr *IndVar, MachineInstr &Cmp,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                           unsigned Iter, unsigned MaxIter) const override;
+
   bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
                        unsigned, unsigned, int &, int &, int &) const override;
   void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -5782,7 +5782,7 @@
   return false;
 }
 
-X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) {
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
   switch (BrOpc) {
   default: return X86::COND_INVALID;
   case X86::JE_1:  return X86::COND_E;
@@ -5805,7 +5805,7 @@
 }
 
 /// Return condition code of a SET opcode.
-X86::CondCode X86::getCondFromSETOpc(unsigned Opc) {
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
   switch (Opc) {
   default: return X86::COND_INVALID;
   case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
@@ -6130,7 +6130,7 @@
       if (!I->isBranch())
        assert(0 && "Can't find the branch to replace!");
 
-      X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode());
+      X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
       assert(BranchCond.size() == 1);
       if (CC != BranchCond[0].getImm())
        continue;
@@ -6237,7 +6237,7 @@
   }
 
   // Handle conditional branches.
-  X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode());
+  X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
   if (BranchCode == X86::COND_INVALID)
     return true; // Can't handle indirect branch.
 
@@ -6433,7 +6433,7 @@
     if (I->isDebugValue())
       continue;
     if (I->getOpcode() != X86::JMP_1 &&
-        X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+        getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
       break;
     // Remove the branch.
     I->eraseFromParent();
@@ -6710,12 +6710,102 @@
     return;
   }
 
-  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
-    // FIXME: We use a fatal error here because historically LLVM has tried
-    // lower some of these physreg copies and we want to ensure we get
-    // reasonable bug reports if someone encounters a case no other testing
-    // found. This path should be removed after the LLVM 7 release.
-    report_fatal_error("Unable to copy EFLAGS physical register!");
+  bool FromEFLAGS = SrcReg == X86::EFLAGS;
+  bool ToEFLAGS = DestReg == X86::EFLAGS;
+  int Reg = FromEFLAGS ? DestReg : SrcReg;
+  bool is32 = X86::GR32RegClass.contains(Reg);
+  bool is64 = X86::GR64RegClass.contains(Reg);
+
+  if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+    int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+    int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+    int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
+    int Pop = is64 ? X86::POP64r : X86::POP32r;
+    int PopF = is64 ? X86::POPF64 : X86::POPF32;
+    int AX = is64 ? X86::RAX : X86::EAX;
+
+    if (!Subtarget.hasLAHFSAHF()) {
+      assert(Subtarget.is64Bit() &&
+             "Not having LAHF/SAHF only happens on 64-bit.");
+      // Moving EFLAGS to / from another register requires a push and a pop.
+      // Notice that we have to adjust the stack if we don't want to clobber
+      // the first frame index. See X86FrameLowering.cpp - usesTheStack.
+      if (FromEFLAGS) {
+        BuildMI(MBB, MI, DL, get(PushF));
+        BuildMI(MBB, MI, DL, get(Pop), DestReg);
+      }
+      if (ToEFLAGS) {
+        BuildMI(MBB, MI, DL, get(Push))
+            .addReg(SrcReg, getKillRegState(KillSrc));
+        BuildMI(MBB, MI, DL, get(PopF));
+      }
+      return;
+    }
+
+    // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+    // inefficient. Instead:
+    //   - Save the overflow flag OF into AL using SETO, and restore it using a
+    //     signed 8-bit addition of AL and INT8_MAX.
+    //   - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+    //     using LAHF/SAHF.
+    //   - When RAX/EAX is live and isn't the destination register, make sure it
+    //     isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+    //     the flags.
+    // This approach is ~2.25x faster than using PUSHF/POPF.
+    //
+    // This is still somewhat inefficient because we don't know which flags are
+    // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+    // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+    //
+    // PUSHF/POPF is also potentially incorrect because it affects other flags
+    // such as TF/IF/DF, which LLVM doesn't model.
+    //
+    // Notice that we have to adjust the stack if we don't want to clobber the
+    // first frame index.
+    // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
+
+    const TargetRegisterInfo &TRI = getRegisterInfo();
+    MachineBasicBlock::LivenessQueryResult LQR =
+        MBB.computeRegisterLiveness(&TRI, AX, MI);
+    // We do not want to save and restore AX if we do not have to.
+    // Moreover, if we do so whereas AX is dead, we would need to set
+    // an undef flag on the use of AX, otherwise the verifier will
+    // complain that we read an undef value.
+    // We do not want to change the behavior of the machine verifier
+    // as this is usually wrong to read an undef value.
+    if (MachineBasicBlock::LQR_Unknown == LQR) {
+      LivePhysRegs LPR(TRI);
+      LPR.addLiveOuts(MBB);
+      MachineBasicBlock::iterator I = MBB.end();
+      while (I != MI) {
+        --I;
+        LPR.stepBackward(*I);
+      }
+      // AX contains the top most register in the aliasing hierarchy.
+      // It may not be live, but one of its aliases may be.
+      for (MCRegAliasIterator AI(AX, &TRI, true);
+           AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
+        LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
+                                : MachineBasicBlock::LQR_Dead;
+    }
+    bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+    if (FromEFLAGS) {
+      BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+      BuildMI(MBB, MI, DL, get(X86::LAHF));
+      BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
+    }
+    if (ToEFLAGS) {
+      BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+          .addReg(X86::AL)
+          .addImm(INT8_MAX);
+      BuildMI(MBB, MI, DL, get(X86::SAHF));
+    }
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Pop), AX);
+    return;
   }
 
   DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
@@ -7375,9 +7465,9 @@
   if (IsCmpZero || IsSwapped) {
     // We decode the condition code from opcode.
     if (Instr.isBranch())
-      OldCC = X86::getCondFromBranchOpc(Instr.getOpcode());
+      OldCC = getCondFromBranchOpc(Instr.getOpcode());
     else {
-      OldCC = X86::getCondFromSETOpc(Instr.getOpcode());
+      OldCC = getCondFromSETOpc(Instr.getOpcode());
       if (OldCC != X86::COND_INVALID)
        OpcIsSET = true;
       else
@@ -9323,9 +9413,8 @@
 isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   // FIXME: Return false for x87 stack register classes for now. We can't
   // allow any loads of these registers before FpGet_ST0_80.
-  return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
-           RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
-           RC == &X86::RFP80RegClass);
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
 }
 
 /// Return a virtual register initialized with the
@@ -10872,3 +10961,49 @@
 
   return It;
 }
+
+bool X86InstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                               MachineInstr *&CmpInst) const {
+  MachineBasicBlock *LoopEnd = L.getBottomBlock();
+  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+  MachineBasicBlock::iterator E = LoopEnd->getFirstNonDebugInstr();
+  MachineInstr *JumpMI = nullptr;
+  MachineInstr *CompMI = nullptr;
+  MachineInstr *AddMI = nullptr;
+  for (; I != E; --I) {
+    if (!JumpMI && I->getOpcode() == X86::JL_1) {
+      JumpMI = &*I;
+    } else if (JumpMI && !CompMI && I->getOpcode() == X86::CMP64rr) {
+      CompMI = &*I;
+    } else if (CompMI && !AddMI &&
+               (I->getOpcode() == X86::INC64r ||
+                I->getOpcode() == X86::ADD64ri8) &&
+               CompMI->getOperand(0).getReg() == I->getOperand(0).getReg()) {
+      AddMI = &*I;
+    } else if (AddMI && I->isPHI()) {
+      if (I->getOperand(0).getReg() == AddMI->getOperand(1).getReg() &&
+          I->getOperand(3).getReg() == AddMI->getOperand(0).getReg()) {
+        IndVarInst = AddMI;
+        CmpInst = CompMI;
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+unsigned X86InstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+                                       MachineInstr *IndVar, MachineInstr &Cmp,
+                                       SmallVectorImpl<MachineOperand> &Cond,
+                                       SmallVectorImpl<MachineInstr *> &PrevInsts,
+                                       unsigned Iter, unsigned MaxIter) const {
+  MachineInstr *CompMI = nullptr;
+  for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I) {
+    if (I->getOpcode() == X86::CMP64rr) {
+      CompMI = &*I;
+    }
+  }
+  unsigned LoopCount = CompMI->getOperand(0).getReg();
+  Cond.push_back(MachineOperand::CreateImm(X86::COND_L));
+  return LoopCount;
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -62,7 +62,6 @@
 void initializeX86CmovConverterPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
 void initializeX86DomainReassignmentPass(PassRegistry &);
-void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -81,7 +80,6 @@
   initializeX86CmovConverterPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
-  initializeX86FlagsCopyLoweringPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -417,7 +415,9 @@
     addPass(createX86CallFrameOptimization());
   }
 
-  addPass(createX86FlagsCopyLoweringPass());
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(&MachinePipelinerID);
+  addPass(createX86WinAllocaExpander());
 }
 
 void X86PassConfig::addMachineSSAOptimization() {
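
Note (not part of the patch): the non-DFA path above replaces the DFAPacketizer with a simple per-cycle resource table, one counter per processor-resource kind, capped by that kind's NumUnits and by the overall issue width. The standalone sketch below illustrates only that bookkeeping outside of LLVM; the ResourceKind/ResourceTable names and the 2-wide example machine are made up for illustration, and in the patch the capacities come from MCSchedModel and MCProcResourceDesc instead.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// One entry per processor-resource kind, analogous to
// MCProcResourceDesc::NumUnits in the patch.
struct ResourceKind {
  unsigned NumUnits;
};

// Per-cycle usage table, analogous to the std::vector<unsigned> ResourceTable
// plus the issue-width check in canReserveResources()/reserveResources().
struct ResourceTable {
  unsigned IssueWidth;
  std::vector<ResourceKind> Kinds; // capacity of each kind
  std::vector<unsigned> Used;      // units of each kind taken this cycle

  ResourceTable(unsigned IW, std::vector<ResourceKind> K)
      : IssueWidth(IW), Kinds(std::move(K)), Used(Kinds.size(), 0) {}

  // Corresponds to clearResources(): forget everything placed in this cycle.
  void clear() { std::fill(Used.begin(), Used.end(), 0u); }

  // Corresponds to canReserveResources(): the cycle must still have a free
  // issue slot and the requested kind must still have a free unit.
  bool canReserve(std::size_t Idx) const {
    unsigned Issues = std::accumulate(Used.begin(), Used.end(), 0u);
    if (Issues >= IssueWidth)
      return false;
    return Used[Idx] < Kinds[Idx].NumUnits;
  }

  // Corresponds to reserveResources().
  void reserve(std::size_t Idx) { ++Used[Idx]; }
};

int main() {
  // Hypothetical 2-wide machine: kind 0 = ALU with 2 units,
  // kind 1 = load/store with 1 unit.
  ResourceTable RT(/*IssueWidth=*/2, {{2}, {1}});

  std::cout << RT.canReserve(1) << '\n'; // 1: a load fits in an empty cycle
  RT.reserve(1);
  std::cout << RT.canReserve(1) << '\n'; // 0: the single load/store unit is busy
  RT.reserve(0);                         // an ALU op still fits
  std::cout << RT.canReserve(0) << '\n'; // 0: both issue slots are now taken
}

With the sub-unit information proposed in https://reviews.llvm.org/D43023 and with ResourceCycles taken into account, a check like canReserve() could model ProcResGroups precisely, which is exactly the limitation recorded in the TODO inside canReserveResources().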