Index: lib/CodeGen/MachineCSE.cpp
===================================================================
--- lib/CodeGen/MachineCSE.cpp
+++ lib/CodeGen/MachineCSE.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -37,6 +38,11 @@
           "Number of cross-MBB physreg referencing CS eliminated");
 STATISTIC(NumCommutes,  "Number of copies coalesced after commuting");
 
+//FIXME remove this after fix CodeGen/X86/inline-asm-fpstack.ll
+static cl::opt<bool>
+CSEIgnoreCopy("cse-ignore-copy", cl::init(false), cl::Hidden,
+              cl::desc("ignore copy"));
+
 namespace {
   class MachineCSE : public MachineFunctionPass {
     const TargetInstrInfo *TII;
@@ -114,53 +120,41 @@
 
 bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI,
                                           MachineBasicBlock *MBB) {
-  bool Changed = false;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || !MO.isUse())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      continue;
-    if (!MRI->hasOneNonDBGUse(Reg))
-      // Only coalesce single use copies. This ensure the copy will be
-      // deleted.
-      continue;
-    MachineInstr *DefMI = MRI->getVRegDef(Reg);
-    if (!DefMI->isCopy())
-      continue;
-    unsigned SrcReg = DefMI->getOperand(1).getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
-      continue;
-    if (DefMI->getOperand(0).getSubReg())
-      continue;
-    // FIXME: We should trivially coalesce subregister copies to expose CSE
-    // opportunities on instructions with truncated operands (see
-    // cse-add-with-overflow.ll). This can be done here as follows:
-    // if (SrcSubReg)
-    //  RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC,
-    //                                     SrcSubReg);
-    // MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
-    //
-    // The 2-addr pass has been updated to handle coalesced subregs. However,
-    // some machine-specific code still can't handle it.
-    // To handle it properly we also need a way find a constrained subregister
-    // class given a super-reg class and subreg index.
-    if (DefMI->getOperand(1).getSubReg())
-      continue;
-    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-    if (!MRI->constrainRegClass(SrcReg, RC))
-      continue;
-    DEBUG(dbgs() << "Coalescing: " << *DefMI);
-    DEBUG(dbgs() << "***     to: " << *MI);
-    MO.setReg(SrcReg);
-    MRI->clearKillFlags(SrcReg);
-    DefMI->eraseFromParent();
-    ++NumCoalesces;
-    Changed = true;
+  if (!MI->isCopy())
+    return false;
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+    return false;
+  unsigned DstReg = MI->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+  if (!MRI->hasOneNonDBGUse(DstReg))
+    // Only coalesce single use copies. This ensure the copy will be
+    // deleted.
+    return false;
+  MachineInstr *UseMI = nullptr;
+  MachineOperand *UseMO = nullptr;
+  bool OnlyOneUse = true;
+  for (MachineOperand &MO : MRI->use_nodbg_operands(DstReg)) {
+    assert(OnlyOneUse && "only one use should be...");
+    OnlyOneUse = false;
+    UseMO = &MO;
+    UseMI = UseMO->getParent();
+    if (!isCSECandidate(UseMI))
+      return false;
   }
-
-  return Changed;
+  if (MI->getOperand(0).getSubReg() ||MI->getOperand(1).getSubReg())
+    return false;
+  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
+  if (!MRI->constrainRegClass(SrcReg, RC))
+    return false;
+  DEBUG(dbgs() << "Coalescing: " << *MI);
+  DEBUG(dbgs() << "***     to: " << *UseMI);
+  UseMO->setReg(SrcReg);
+  MRI->clearKillFlags(SrcReg);
+  MI->eraseFromParent();
+  ++NumCoalesces;
+  return true;
 }
 
 bool
@@ -330,8 +324,8 @@
       MI->isInlineAsm() || MI->isDebugValue())
     return false;
 
-  // Ignore copies.
-  if (MI->isCopyLike())
+  // Ignore copies.
+  if (CSEIgnoreCopy && MI->isCopyLike())
     return false;
 
   // Ignore stuff that we obviously can't move.
@@ -448,22 +442,15 @@
     MachineInstr *MI = &*I;
     ++I;
 
+    if (PerformTrivialCoalescing(MI, MBB)) {
+      Changed = true;
+      continue;
+    }
+
     if (!isCSECandidate(MI))
       continue;
 
     bool FoundCSE = VNT.count(MI);
-    if (!FoundCSE) {
-      // Look for trivial copy coalescing opportunities.
-      if (PerformTrivialCoalescing(MI, MBB)) {
-        Changed = true;
-
-        // After coalescing MI itself may become a copy.
-        if (MI->isCopyLike())
-          continue;
-        FoundCSE = VNT.count(MI);
-      }
-    }
-
     // Commute commutable instructions.
     bool Commuted = false;
     if (!FoundCSE && MI->isCommutable()) {
Index: test/CodeGen/ARM/atomic-64bit.ll
===================================================================
--- test/CodeGen/ARM/atomic-64bit.ll
+++ test/CodeGen/ARM/atomic-64bit.ll
@@ -189,8 +189,8 @@
 ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
 ; CHECK-THUMB-LE-DAG: eor.w     [[MISMATCH_LO:[a-z0-9]+]], [[REG1]], r2
 ; CHECK-THUMB-LE-DAG: eor.w     [[MISMATCH_HI:[a-z0-9]+]], [[REG2]], r3
-; CHECK-THUMB-BE-DAG: eor.w     [[MISMATCH_HI:[a-z0-9]+]], [[REG1]]
-; CHECK-THUMB-BE-DAG: eor.w     [[MISMATCH_LO:[a-z0-9]+]], [[REG2]]
+; CHECK-THUMB-BE-DAG: eor.w     [[MISMATCH_HI:[a-z0-9]+]], [[REG1]], r2
+; CHECK-THUMB-BE-DAG: eor.w     [[MISMATCH_LO:[a-z0-9]+]], [[REG2]], r3
 ; CHECK-THUMB: orrs    [[MISMATCH_HI]], [[MISMATCH_LO]]
 ; CHECK-THUMB: bne
 ; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
Index: test/CodeGen/ARM/debug-info-branch-folding.ll
===================================================================
--- test/CodeGen/ARM/debug-info-branch-folding.ll
+++ test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -2,11 +2,11 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.6.7"
 
-;CHECK: 	vadd.f32	q4, q8, q8
+;CHECK: 	vadd.f32	q[[REGNO:[0-9]+]], q8, q8
 ;CHECK-NEXT: LBB0_1
 
-;CHECK:@DEBUG_VALUE: x <- Q4{{$}}
-;CHECK-NEXT:@DEBUG_VALUE: y <- Q4{{$}}
+;CHECK:@DEBUG_VALUE: x <- Q[[REGNO]]{{$}}
+;CHECK-NEXT:@DEBUG_VALUE: y <- Q[[REGNO]]{{$}}
 
 
 @.str = external constant [13 x i8]
Index: test/CodeGen/X86/cse-add-with-overflow.ll
===================================================================
--- test/CodeGen/X86/cse-add-with-overflow.ll
+++ test/CodeGen/X86/cse-add-with-overflow.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=generic | FileCheck %s
-; XFAIL: *
 ; rdar:15661073 simple example of redundant adds
 ;
 ; MachineCSE should coalesce trivial subregister copies.
@@ -6,7 +5,6 @@
 ;
 ; The extra movl+addl should be removed during MachineCSE.
 ; CHECK-LABEL: redundantadd
-; CHECK: cmpq
 ; CHECK: movq
 ; CHECK-NOT: movl
 ; CHECK: addl
@@ -17,13 +15,6 @@
 entry:
   %tmp8 = load i64* %a0, align 8
   %tmp12 = load i64* %a1, align 8
-  %tmp13 = icmp ult i64 %tmp12, -281474976710656
-  br i1 %tmp13, label %exit1, label %body
-
-exit1:
-  unreachable
-
-body:
   %tmp14 = trunc i64 %tmp8 to i32
   %tmp15 = trunc i64 %tmp12 to i32
   %tmp16 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %tmp14, i32 %tmp15)
@@ -36,8 +27,7 @@
 return:
   %tmp18 = add i64 %tmp12, %tmp8
   %tmp19 = and i64 %tmp18, 4294967295
-  %tmp20 = or i64 %tmp19, -281474976710656
-  ret i64 %tmp20
+  ret i64 %tmp19
 }
 
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
Index: test/CodeGen/X86/inline-asm-fpstack.ll
===================================================================
--- test/CodeGen/X86/inline-asm-fpstack.ll
+++ test/CodeGen/X86/inline-asm-fpstack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as -cse-ignore-copy | FileCheck %s
 
 ; There should be no stack manipulations between the inline asm and ret.
 ; CHECK: test1