Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -77,6 +77,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -119,6 +120,14 @@
     "rewrite-phi-limit", cl::Hidden, cl::init(10),
     cl::desc("Limit the length of PHI chains to lookup"));
 
+// Limit the length of recurrence chain when evaluating the benefit of
+// commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+    "recurrence-chain-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+             "of commuting operands"));
+
+
 STATISTIC(NumReuse, "Number of extension results reused");
 STATISTIC(NumCmps, "Number of compares eliminated");
 STATISTIC(NumImmFold, "Number of move immediate folded");
@@ -131,12 +140,14 @@
 namespace {
   class ValueTrackerResult;
+  class RecurrenceInstr;
 
   class PeepholeOptimizer : public MachineFunctionPass {
     const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
     MachineRegisterInfo *MRI;
     MachineDominatorTree *DT; // Machine dominator tree
+    MachineLoopInfo *MLI;
 
   public:
     static char ID; // Pass identification
@@ -150,6 +161,8 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
       if (Aggressive) {
         AU.addRequired<MachineDominatorTree>();
         AU.addPreserved<MachineDominatorTree>();
@@ -160,6 +173,9 @@
     typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
         RewriteMapTy;
 
+    /// \brief Sequence of instructions that formulate a recurrence cycle.
+    typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
   private:
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeRecurrence(MachineInstr &PHI);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr *> &ImmDefMIs);
+
+    /// \brief Finds recurrence cycles, but only ones that are formulated
+    /// around a def operand and a use operand that are tied. If there is a
+    /// use operand commutable with the tied use operand, find the recurrence
+    /// cycle along that operand as well.
+    void findTargetRecurrence(unsigned DefReg, unsigned TargetReg,
+                              RecurrenceCycle &RC,
+                              SmallVectorImpl<RecurrenceCycle> &RCs);
 
     /// \brief If copy instruction \p MI is a virtual register copy, track it in
     /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
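A note for readers less familiar with this corner of CodeGen: the recurrence search declared above is driven entirely by tied operands, the def/use pairs that two-address targets force into the same register. A minimal sketch of the underlying query (illustrative only, not part of the patch; printTiedUse is a hypothetical helper):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

// Report which use operand, if any, is tied to the def in operand 0.
// For "%vreg0 = ADD %vreg2, %vreg1" on a two-address target, the def
// %vreg0 is tied to the first source, so this prints use operand #1.
static void printTiedUse(const MachineInstr &MI) {
  unsigned TiedUseIdx;
  if (MI.isRegTiedToUseOperand(/*DefOpIdx=*/0, &TiedUseIdx))
    dbgs() << "def is tied to use operand #" << TiedUseIdx << "\n";
}

findTargetRecurrence below relies on exactly this query, via the DefIdx it computes, to decide which use operand to follow.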
@@ -222,6 +246,28 @@
     }
   };
 
+  /// \brief Helper class to hold instructions that are inside recurrence
+  /// cycles. The recurrence cycle is formulated around 1) a def operand and
+  /// its tied use operand, or 2) a def operand and a use operand that is
+  /// commutable with another use operand which is tied to the def operand.
+  /// In the latter case, the indices of the tied use operand and the
+  /// commutable use operand are maintained with CommutePair.
+  class RecurrenceInstr {
+  public:
+    typedef std::pair<unsigned, unsigned> IndexPair;
+
+    RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+    RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+        : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+    MachineInstr *getMI() const { return MI; }
+    Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+  private:
+    MachineInstr *MI;
+    Optional<IndexPair> CommutePair;
+  };
+
   /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
   /// returned sources for a given search and the instructions where the sources
   /// were tracked from.
@@ -412,6 +458,7 @@
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                       "Peephole Optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                     "Peephole Optimizations", false, false)
@@ -1487,6 +1534,114 @@
   return false;
 }
 
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+  return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+void PeepholeOptimizer::findTargetRecurrence(
+    unsigned DefReg, unsigned TargetReg, RecurrenceCycle &RC,
+    SmallVectorImpl<RecurrenceCycle> &RCs) {
+  // Recurrence found if (DefReg == TargetReg).
+  if (DefReg == TargetReg) {
+    RCs.push_back(RC);
+    return;
+  }
+
+  // Give up if the recurrence chain length is longer than the limit.
+  if (RC.size() >= MaxRecurrenceChain)
+    return;
+
+  MachineInstr *DefMI = MRI->getVRegDef(DefReg);
+  int DefIdx = DefMI->findRegisterDefOperandIdx(DefReg);
+  assert(DefIdx != -1 && "DefReg is not defined by DefMI");
+
+  // Check if there is a use operand that is tied to DefReg. We are only
+  // interested in the case where all the instructions in the recurrence
+  // chain have their def operand tied with one of the use operands.
+  unsigned TiedUseIdx;
+  if (!DefMI->isRegTiedToUseOperand(DefIdx, &TiedUseIdx))
+    return;
+
+  auto Progress = [&](unsigned Idx, RecurrenceInstr &&RI) {
+    MachineOperand &MO = DefMI->getOperand(Idx);
+    if (isVirtualRegisterOperand(MO)) {
+      RC.push_back(RI);
+      findTargetRecurrence(MO.getReg(), TargetReg, RC, RCs);
+      RC.pop_back();
+    }
+  };
+
+  // Continue the search with the tied use operand.
+  Progress(TiedUseIdx, RecurrenceInstr(DefMI));
+
+  // If the tied use operand is commutable, continue the search with the
+  // commutable operand as well.
+  unsigned CommutableIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+  if (TII->findCommutedOpIndices(*DefMI, TiedUseIdx, CommutableIdx))
+    Progress(CommutableIdx, RecurrenceInstr(DefMI, TiedUseIdx, CommutableIdx));
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions.
+/// If phi is in a loop header, a recurrence may be formulated around the
+/// source and destination of the phi. For such a case, commuting operands of
+/// the instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi. For example, if there is a recurrence
+/// of
+///
+/// LoopHeader:
+///   %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+///   %vreg0 = ADD %vreg2, %vreg1
+///
+/// , the fact that %vreg0 and %vreg2 are in the same tied operands set makes
+/// the coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
+/// %vreg2 have overlapping live ranges. This introduces an additional move
+/// instruction into the final assembly. However, if we commute %vreg2 and
+/// %vreg1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+  bool Changed = false;
+  unsigned TargetReg = PHI.getOperand(0).getReg();
+
+  // Find recurrence cycles of interest that are formulated around PHI.
+  // Except for the PHI itself, all the instructions in the cycles are
+  // required to have their def operand tied with one of their source
+  // operands.
+  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+    MachineOperand &MO = PHI.getOperand(Idx);
+    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+
+    unsigned Reg = MO.getReg();
+    MachineInstr *MI = MRI->getVRegDef(Reg);
+    if (!MI)
+      continue;
+
+    RecurrenceCycle RC;
+    SmallVector<RecurrenceCycle, 4> RCs;
+    findTargetRecurrence(Reg, TargetReg, RC, RCs);
+
+    // Commute operands of the instructions in each RC if necessary so that
+    // the copy to be generated from PHI can be coalesced.
+    for (auto &RC : RCs) {
+      DEBUG(dbgs() << "Optimize recurrence chain from " << PHI << "\n");
+      for (auto RI : RC) {
+        DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+        auto CP = RI.getCommutePair();
+        if (CP) {
+          Changed = true;
+          TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+                                  (*CP).second);
+          DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
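As a worked example, consider bb7 of the new test at the end of this patch (test/CodeGen/X86/peephole-recurrence.ll). After instruction selection it contains, schematically (assuming the usual x86 two-address form, where each ADD's def is tied to its first source):

  bb1: %vreg0 = PHI 0, <bb0>, %vreg3, <bb7>   ; TargetReg = %vreg0
  bb7: %vreg2 = ADD %vreg5, %vreg0
       %vreg3 = ADD %vreg1, %vreg2

findTargetRecurrence(%vreg3, %vreg0, ...) first follows the tied use %vreg1 of the second ADD, which dead-ends because %vreg1 is defined by a PHI with no tied operands. It then tries the commutable use %vreg2; from there the tied use %vreg5 dead-ends the same way, but the commutable use %vreg0 equals TargetReg, so a two-instruction cycle is recorded with a CommutePair on each ADD. optimizeRecurrence then commutes both ADDs, after which %vreg0, %vreg2, and %vreg3 sit on a single tied chain and the COPY lowered from the PHI can be coalesced, which is exactly the redundant-movl elimination the test checks for.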
@@ -1501,6 +1656,7 @@
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+  MLI = &getAnalysis<MachineLoopInfo>();
 
   bool Changed = false;
 
@@ -1529,6 +1685,8 @@
     SmallSet<unsigned, 4> CopySrcRegs;
     DenseMap<unsigned, MachineInstr *> CopySrcMIs;
 
+    bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
     for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
          MII != MIE; ) {
       MachineInstr *MI = &*MII;
@@ -1540,9 +1698,16 @@
       if (MI->isDebugValue())
         continue;
 
-      if (MI->isPosition() || MI->isPHI())
+      if (MI->isPosition())
         continue;
 
+      if (IsLoopHeader && MI->isPHI()) {
+        if (optimizeRecurrence(*MI)) {
+          Changed = true;
+          continue;
+        }
+      }
+
       if (!MI->isCopy()) {
         for (const auto &Op : MI->operands()) {
           // Visit all operands: definitions can be implicit or explicit.
@@ -1667,7 +1832,7 @@
         MRI->markUsesInDebugValueAsUndef(FoldedReg);
         FoldAsLoadDefCandidates.erase(FoldedReg);
         ++NumLoadFold;
-        
+
         // MI is replaced with FoldMI so we can continue trying to fold
         Changed = true;
         MI = FoldMI;
@@ -1675,7 +1840,7 @@
       }
     }
   }
-  
+
   // If we run into an instruction we can't fold across, discard
   // the load candidates. Note: We might be able to fold *into* this
   // instruction, so this needs to be after the folding logic.
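The commuting itself is done with two existing TargetInstrInfo hooks, findCommutedOpIndices and commuteInstruction, as used in findTargetRecurrence and optimizeRecurrence above. A minimal sketch of the call pattern in isolation (illustrative only; DefMI, TiedUseIdx, and TII stand in for the values used in the patch):

// Ask the target whether the tied use operand (Idx1) can legally trade
// places with some other use operand; CommuteAnyOperandIndex lets the
// target pick the partner.
unsigned Idx1 = TiedUseIdx;
unsigned Idx2 = TargetInstrInfo::CommuteAnyOperandIndex;
if (TII->findCommutedOpIndices(*DefMI, Idx1, Idx2)) {
  // With NewMI == false the instruction is commuted in place and the
  // same MachineInstr pointer is returned.
  TII->commuteInstruction(*DefMI, /*NewMI=*/false, Idx1, Idx2);
}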
Index: lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- lib/CodeGen/TwoAddressInstructionPass.cpp
+++ lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -68,6 +68,13 @@
     cl::desc("Coalesce copies by rescheduling (default=true)"),
     cl::init(true), cl::Hidden);
 
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+    "dataflow-edge-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+             "the benefit of commuting operands"));
+
 namespace {
 class TwoAddressInstructionPass : public MachineFunctionPass {
   MachineFunction *MF;
@@ -637,10 +644,10 @@
   // To more generally minimize register copies, ideally the logic of two addr
   // instruction pass should be integrated with register allocation pass where
   // interference graph is available.
-  if (isRevCopyChain(regC, regA, 3))
+  if (isRevCopyChain(regC, regA, MaxDataFlowEdge))
     return true;
 
-  if (isRevCopyChain(regB, regA, 3))
+  if (isRevCopyChain(regB, regA, MaxDataFlowEdge))
     return false;
 
   // Since there are no intervening uses for both registers, then commute
Index: test/CodeGen/MIR/Generic/multiRunPass.mir
===================================================================
--- test/CodeGen/MIR/Generic/multiRunPass.mir
+++ test/CodeGen/MIR/Generic/multiRunPass.mir
@@ -7,7 +7,8 @@
 # This test ensures that the command line accepts
 # several run passes on the same command line and
 # actually create the proper pipeline for it.
-# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos
+# PSEUDO_PEEPHOLE-SAME: -peephole-opt
 # PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos
 
 # Make sure there are no other passes happening after what we asked.
Index: test/CodeGen/X86/fma-do-not-commute.ll
===================================================================
--- test/CodeGen/X86/fma-do-not-commute.ll
+++ test/CodeGen/X86/fma-do-not-commute.ll
@@ -9,9 +9,9 @@
 ; CHECK-NOT: {{.*}}, %xmm0
 ; %addr lives in rdi.
 ; %addr2 lives in rsi.
-; CHECK: vmovss (%rdi), [[ADDR:%xmm[0-9]+]]
+; CHECK: vmovss (%rsi), [[ADDR2:%xmm[0-9]+]]
 ; The assembly syntax is in the reverse order.
-; CHECK: vfmadd231ss (%rsi), [[ADDR]], %xmm0
+; CHECK: vfmadd231ss (%rdi), [[ADDR2]], %xmm0
 define void @test1(float* %addr, float* %addr2, float %arg) {
 entry:
   br label %loop
Index: test/CodeGen/X86/peephole-recurrence.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/peephole-recurrence.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+;
+; CHECK: bb7
+; CHECK: cmpl{{ +}}$10
+; CHECK-NOT: movl
+; CHECK: jl
+;
+; Check that there's no redundant copy at the end of bb7 to feed %vreg3 to bb1.
+
+define i32 @foo(i32 %a) {
+bb0:
+  br label %bb1
+
+bb1:                                              ; preds = %bb0, %bb7
+  %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+  %cond0 = icmp eq i32 %a, 0
+  br i1 %cond0, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  br label %bb4
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb2, %bb3
+  %vreg5 = phi i32 [ 1, %bb2 ], [ 2, %bb3 ]
+  %cond1 = icmp eq i32 %vreg5, 0
+  br i1 %cond1, label %bb5, label %bb6
+
+bb5:                                              ; preds = %bb4
+  br label %bb7
+
+bb6:                                              ; preds = %bb4
+  br label %bb7
+
+bb7:                                              ; preds = %bb5, %bb6
+  %vreg1 = phi i32 [ 1, %bb5 ], [ 2, %bb6 ]
+  %vreg2 = add i32 %vreg5, %vreg0
+  %vreg3 = add i32 %vreg1, %vreg2
+  %cond2 = icmp slt i32 %vreg3, 10
+  br i1 %cond2, label %bb1, label %bb8
+
+bb8:                                              ; preds = %bb7
+  ret i32 0
+}
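Both limits introduced by this patch are ordinary cl::opt flags, so they can be tuned from the llc command line when experimenting. An illustrative invocation (not one of the RUN lines above):

  llc -march=x86-64 -recurrence-chain-limit=4 -dataflow-edge-limit=4 < test/CodeGen/X86/peephole-recurrence.ll

Raising -recurrence-chain-limit lets findTargetRecurrence follow longer chains of tied defs at some compile-time cost; both limits default to 3.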