Index: lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- lib/CodeGen/TwoAddressInstructionPass.cpp +++ lib/CodeGen/TwoAddressInstructionPass.cpp @@ -102,6 +102,8 @@ bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, MachineBasicBlock::iterator OldPos); + bool isRevCopyChain(unsigned FromReg, unsigned ToReg, int Maxlen); + bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef); bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, @@ -309,6 +311,45 @@ return true; } +/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg +/// in the current BB. +static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB, + const MachineRegisterInfo *MRI) { + MachineInstr *Ret = nullptr; + for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { + if (DefMI.getParent() != BB || DefMI.isDebugValue()) + continue; + if (!Ret) + Ret = &DefMI; + else if (Ret != &DefMI) + return nullptr; + } + return Ret; +} + +/// Check if there is a reversed copy chain from FromReg to ToReg: +/// %Tmp1 = copy %Tmp2; +/// %FromReg = copy %Tmp1; +/// %ToReg = add %FromReg ... +/// %Tmp2 = copy %ToReg; +/// Maxlen specifies the maximum length of the copy chain the func +/// can walk through. +bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg, + int Maxlen) { + unsigned TmpReg = FromReg; + for (int i = 0; i < Maxlen; i++) { + MachineInstr *Def = getSingleDef(TmpReg, MBB, MRI); + if (!Def || !Def->isCopy()) + return false; + + TmpReg = Def->getOperand(1).getReg(); + + if (TmpReg == ToReg) + return true; + } + return false; +} + /// noUseAfterLastDef - Return true if there are no intervening uses between the /// last instruction in the MBB that defines the specified register and the /// two-address instruction which is being processed. 
It also returns the last @@ -574,6 +615,27 @@ if (!noUseAfterLastDef(regB, Dist, LastDefB)) return true; + // Look for a situation like this: + // %reg101 = MOV %reg100 + // %reg102 = ... + // %reg103 = ADD %reg102, %reg101 + // ... = %reg103 ... + // %reg100 = MOV %reg103 + // If there is a reversed copy chain from reg101 to reg103, commute the ADD + // to eliminate an otherwise unavoidable copy. + // FIXME: + // We can extend the logic further: If a pair of operands in an insn has + // been merged, the insn could be regarded as a virtual copy, and the virtual + // copy could also be used to construct a copy chain. + // To more generally minimize register copies, ideally the logic of two addr + // instruction pass should be integrated with register allocation pass where + // interference graph is available. + if (isRevCopyChain(regC, regA, 3)) + return true; + + if (isRevCopyChain(regB, regA, 3)) + return false; + // Since there are no intervening uses for both registers, then commute // if the def of regC is closer. Its live interval is shorter. return LastDefB && LastDefC && LastDefC > LastDefB; Index: test/CodeGen/X86/twoaddr-coalesce-3.ll =================================================================== --- test/CodeGen/X86/twoaddr-coalesce-3.ll +++ test/CodeGen/X86/twoaddr-coalesce-3.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s +; This test is to ensure TwoAddrInstruction pass chooses the proper operands to merge and +; generate fewer mov insns. + +@M = common global i32 0, align 4 +@total = common global i32 0, align 4 +@g = common global i32 0, align 4 + +; Function Attrs: nounwind uwtable +define void @foo() { +entry: + %0 = load i32* @M, align 4 + %cmp3 = icmp sgt i32 %0, 0 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %total.promoted = load i32* @total, align 4 + br label %for.body + +; Check that only one mov will be generated in the kernel loop. 
+; CHECK-LABEL: foo: +; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: # %for.body +; CHECK-NOT: mov +; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]] +; CHECK-NOT: mov +; CHECK: shrl $31, [[REG1]] +; CHECK-NOT: mov +; CHECK: jl [[LOOP1]] +for.body: ; preds = %for.body.lr.ph, %for.body + %add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ] + %i.04 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %div = sdiv i32 %i.04, 2 + %add = add nsw i32 %div, %add5 + %inc = add nuw nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, %0 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + store i32 %add, i32* @total, align 4 + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} + +; Function Attrs: nounwind uwtable +define void @goo() { +entry: + %0 = load i32* @M, align 4 + %cmp3 = icmp sgt i32 %0, 0 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %total.promoted = load i32* @total, align 4 + br label %for.body + +; Check that only two mov will be generated in the kernel loop. +; CHECK-LABEL: goo: +; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: # %for.body +; CHECK-NOT: mov +; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]] +; CHECK-NOT: mov +; CHECK: shrl $31, [[REG2]] +; CHECK-NOT: mov +; CHECK: movl {{.*}}, g(%rip) +; CHECK: jl [[LOOP2]] +for.body: ; preds = %for.body.lr.ph, %for.body + %add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ] + %i.04 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %div = sdiv i32 %i.04, 2 + %add = add nsw i32 %div, %add5 + store volatile i32 %add, i32* @g, align 4 + %inc = add nuw nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, %0 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + store i32 %add, i32* @total, align 4 + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} +