Index: lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- lib/Target/ARM/ARMBaseInstrInfo.h
+++ lib/Target/ARM/ARMBaseInstrInfo.h
@@ -215,6 +215,8 @@
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
+  bool shouldSink(const MachineInstr &MI) const override;
+
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      unsigned DestReg, unsigned SubIdx,
                      const MachineInstr &Orig,
Index: lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2534,14 +2534,28 @@
   }
 }
 
+/// getCmpToAddCondition - assume the flags are set by CMP(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by ADD(a,b,X).
+inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: return ARMCC::AL;
+  case ARMCC::HS: return ARMCC::LO;
+  case ARMCC::LO: return ARMCC::HS;
+  case ARMCC::VS: return ARMCC::VS;
+  case ARMCC::VC: return ARMCC::VC;
+  }
+}
+
 /// isRedundantFlagInstr - check whether the first instruction, whose only
 /// purpose is to update flags, can be made redundant.
 /// CMPrr can be made redundant by SUBrr if the operands are the same.
 /// CMPri can be made redundant by SUBri if the operands are the same.
+/// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X).
 /// This function can be extended later on.
-inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
-                                        unsigned SrcReg2, int ImmValue,
-                                        MachineInstr *OI) {
+inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
+                                        unsigned SrcReg, unsigned SrcReg2,
+                                        int ImmValue, const MachineInstr *OI) {
   if ((CmpI->getOpcode() == ARM::CMPrr ||
        CmpI->getOpcode() == ARM::t2CMPrr) &&
       (OI->getOpcode() == ARM::SUBrr ||
@@ -2559,6 +2573,14 @@
       OI->getOperand(1).getReg() == SrcReg &&
       OI->getOperand(2).getImm() == ImmValue)
     return true;
+
+  if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
+      (OI->getOpcode() == ARM::ADDrr || OI->getOpcode() == ARM::t2ADDrr ||
+       OI->getOpcode() == ARM::ADDri || OI->getOpcode() == ARM::t2ADDri) &&
+      OI->getOperand(0).isReg() && OI->getOperand(1).isReg() &&
+      OI->getOperand(0).getReg() == SrcReg &&
+      OI->getOperand(1).getReg() == SrcReg2)
+    return true;
   return false;
 }
 
@@ -2661,17 +2683,18 @@
   if (I == B) return false;
 
   // There are two possible candidates which can be changed to set CPSR:
-  // One is MI, the other is a SUB instruction.
-  // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  // One is MI, the other is a SUB or ADD instruction.
+  // For CMPrr(r1,r2), we are looking for SUB(r1,r2), SUB(r2,r1), or
+  // ADDr[ri](r1, r2, X).
   // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
-  MachineInstr *Sub = nullptr;
+  MachineInstr *SubAdd = nullptr;
   if (SrcReg2 != 0)
     // MI is not a candidate for CMPrr.
     MI = nullptr;
   else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
     // Conservatively refuse to convert an instruction which isn't in the same
     // BB as the comparison.
-    // For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
+    // For CMPri w/ CmpValue != 0, a SubAdd may still be a candidate.
     // Thus we cannot return here.
     if (CmpInstr.getOpcode() == ARM::CMPri ||
         CmpInstr.getOpcode() == ARM::t2CMPri)
@@ -2713,38 +2736,43 @@
     }
     I = CmpInstr;
     E = MI;
+  } else {
+    // Allow the loop below to search E (which was initially MI).  Since MI and
+    // SubAdd have different tests, even if that instruction could not be MI, it
+    // could still potentially be SubAdd.
+    --E;
   }
 
   // Check that CPSR isn't set between the comparison instruction and the one we
-  // want to change. At the same time, search for Sub.
+  // want to change. At the same time, search for SubAdd.
   const TargetRegisterInfo *TRI = &getRegisterInfo();
   --I;
   for (; I != E; --I) {
     const MachineInstr &Instr = *I;
 
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+      SubAdd = &*I;
+      break;
+    }
+
     if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
         Instr.readsRegister(ARM::CPSR, TRI))
       // This instruction modifies or uses CPSR after the one we want to
       // change. We can't do this transformation.
       return false;
 
-    // Check whether CmpInstr can be made redundant by the current instruction.
-    if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
-      Sub = &*I;
-      break;
-    }
-
     if (I == B)
       // The 'and' is below the comparison instruction.
       return false;
   }
 
   // Return false if no candidates exist.
-  if (!MI && !Sub)
+  if (!MI && !SubAdd)
     return false;
 
   // The single candidate is called MI.
-  if (!MI) MI = Sub;
+  if (!MI) MI = SubAdd;
 
   // We can't use a predicated instruction - it doesn't always write the flags.
   if (isPredicated(*MI))
@@ -2802,25 +2830,31 @@
         break;
       }
 
-      if (Sub) {
-        ARMCC::CondCodes NewCC = getSwappedCondition(CC);
-        if (NewCC == ARMCC::AL)
-          return false;
+      if (SubAdd) {
         // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
         // on CMP needs to be updated to be based on SUB.
+        // If we have ADD(r1, r2, X) and CMP(r1, r2), the condition code also
+        // needs to be modified.
         // Push the condition code operands to OperandsToUpdate.
         // If it is safe to remove CmpInstr, the condition code of these
         // operands will be modified.
-        if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
-            Sub->getOperand(2).getReg() == SrcReg) {
+        unsigned Opc = SubAdd->getOpcode();
+        bool IsSub = Opc == ARM::SUBrr || Opc == ARM::t2SUBrr ||
+                     Opc == ARM::SUBri || Opc == ARM::t2SUBri;
+        if (!IsSub || (SrcReg2 != 0 && SubAdd->getOperand(1).getReg() == SrcReg2 &&
+                       SubAdd->getOperand(2).getReg() == SrcReg)) {
           // VSel doesn't support condition code update.
           if (IsInstrVSel)
             return false;
+          // Ensure we can swap the condition.
+          ARMCC::CondCodes NewCC = (IsSub ? getSwappedCondition(CC) : getCmpToAddCondition(CC));
+          if (NewCC == ARMCC::AL)
+            return false;
           OperandsToUpdate.push_back(
               std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
         }
       } else {
-        // No Sub, so this is x = <op> y, z; cmp x, 0.
+        // No SubAdd, so this is x = <op> y, z; cmp x, 0.
         switch (CC) {
         case ARMCC::EQ: // Z
         case ARMCC::NE: // Z
@@ -2874,6 +2908,23 @@
   return true;
 }
 
+bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
+  // Do not sink MI if it might be used to optimize a redundant compare.
+  // We heuristically only look at the instruction immediately following MI to
+  // avoid potentially searching the entire basic block.
+  if (isPredicated(MI))
+    return true;
+  MachineBasicBlock::const_iterator Next = &MI;
+  ++Next;
+  unsigned SrcReg, SrcReg2;
+  int CmpMask, CmpValue;
+  if (Next != MI.getParent()->end() &&
+      analyzeCompare(*Next, SrcReg, SrcReg2, CmpMask, CmpValue) &&
+      isRedundantFlagInstr(&*Next, SrcReg, SrcReg2, CmpValue, &MI))
+    return false;
+  return true;
+}
+
 bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      unsigned Reg,
                                      MachineRegisterInfo *MRI) const {
Index: test/CodeGen/ARM/intrinsics-overflow.ll
===================================================================
--- test/CodeGen/ARM/intrinsics-overflow.ll
+++ test/CodeGen/ARM/intrinsics-overflow.ll
@@ -33,10 +33,10 @@
 
   ; CHECK-LABEL: sadd_overflow:
 
-  ; ARM: add r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]]
-  ; ARM: mov r[[R1]], #1
-  ; ARM: cmp r[[R2]], r[[R0]]
-  ; ARM: movvc r[[R1]], #0
+  ; ARM: adds r[[R2:[0-9]+]], r[[R0:[0-9]+]], r[[R1:[0-9]+]]
+  ; ARM: mov r[[R0]], #1
+  ; ARM: movvc r[[R0]], #0
+  ; ARM: mov pc, lr
 
   ; THUMBV6: mov  r[[R2:[0-9]+]], r[[R0:[0-9]+]]
   ; THUMBV6: adds r[[R3:[0-9]+]], r[[R2]], r[[R1:[0-9]+]]
@@ -47,11 +47,10 @@
   ; THUMBV6: mov  r[[R0]], r[[R1]]
   ; THUMBV6: .L[[LABEL]]:
 
-  ; THUMBV7: movs  r[[R1]], #1
-  ; THUMBV7: cmp   r[[R2]], r[[R0]]
+  ; THUMBV7: adds  r[[R2:[0-9]+]], r[[R0]], r[[R1:[0-9]+]]
+  ; THUMBV7: mov.w r[[R0:[0-9]+]], #1
   ; THUMBV7: it    vc
-  ; THUMBV7: movvc r[[R1]], #0
-  ; THUMBV7: mov   r[[R0]], r[[R1]]
+  ; THUMBV7: movvc r[[R0]], #0
 }
 
 define i32 @usub_overflow(i32 %a, i32 %b) #0 {
Index: test/CodeGen/ARM/su-addsub-overflow.ll
===================================================================
--- test/CodeGen/ARM/su-addsub-overflow.ll
+++ test/CodeGen/ARM/su-addsub-overflow.ll
@@ -2,9 +2,7 @@
 
 define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: sadd:
-; CHECK:    mov r[[R0:[0-9]+]], r0
-; CHECK-NEXT:    add r[[R1:[0-9]+]], r[[R0]], r1
-; CHECK-NEXT:    cmp r[[R1]], r[[R0]]
+; CHECK:    adds r0, r0, r1
 ; CHECK-NEXT:    movvc pc, lr
 entry:
   %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -23,10 +21,8 @@
 
 define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: uadd:
-; CHECK:    mov r[[R0:[0-9]+]], r0
-; CHECK-NEXT:    adds r[[R1:[0-9]+]], r[[R0]], r1
-; CHECK-NEXT:    cmp r[[R1]], r[[R0]]
-; CHECK-NEXT:    movhs pc, lr
+; CHECK:    adds r0, r0, r1
+; CHECK-NEXT:    movlo pc, lr
 entry:
   %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %1 = extractvalue { i32, i1 } %0, 1
@@ -44,8 +40,7 @@
 
 define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: ssub:
-; CHECK:    cmp r0, r1
-; CHECK-NEXT:    subvc r0, r0, r1
+; CHECK:    subs r0, r0, r1
 ; CHECK-NEXT:    movvc pc, lr
 entry:
   %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
@@ -64,9 +59,7 @@
 
 define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: usub:
-; CHECK:    mov r[[R0:[0-9]+]], r0
-; CHECK-NEXT:    subs r[[R1:[0-9]+]], r[[R0]], r1
-; CHECK-NEXT:    cmp r[[R0]], r1
+; CHECK:    subs r0, r0, r1
 ; CHECK-NEXT:    movhs pc, lr
 entry:
   %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
@@ -87,11 +80,9 @@
 ; CHECK-LABEL: sum:
 ; CHECK:    ldr [[R0:r[0-9]+]],
 ; CHECK-NEXT:    ldr [[R1:r[0-9]+|lr]],
-; CHECK-NEXT:    add [[R2:r[0-9]+]], [[R1]], [[R0]]
-; CHECK-NEXT:    cmp [[R2]], [[R1]]
+; CHECK-NEXT:    adds [[R2:r[0-9]+]], [[R1]], [[R0]]
 ; CHECK-NEXT:    strvc [[R2]],
-; CHECK-NEXT:    addvc
-; CHECK-NEXT:    cmpvc
+; CHECK-NEXT:    addsvc
 ; CHECK-NEXT:    bvs
 entry:
   %cmp7 = icmp eq i32 %n, 0
@@ -128,6 +119,46 @@
 
 }
 
+define void @extern_loop(i32 %n) local_unnamed_addr #0 {
+; Do not replace the compare around the clobbering call.
+; CHECK: add {{r[0-9]+}}, {{r[0-9]+}}, #1
+; CHECK-NEXT: bl external_fn
+; CHECK: cmp
+entry:
+  %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %n, i32 1)
+  %1 = extractvalue { i32, i1 } %0, 1
+  br i1 %1, label %trap, label %cont.lr.ph
+
+cont.lr.ph:
+  %2 = extractvalue { i32, i1 } %0, 0
+  %cmp5 = icmp sgt i32 %2, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+trap:
+  tail call void @llvm.trap() #2
+  unreachable
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.046 = phi i32 [ %5, %cont1 ], [ 0, %for.body.preheader ]
+  tail call void bitcast (void (...)* @external_fn to void ()*)() #4
+  %3 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.046, i32 1)
+  %4 = extractvalue { i32, i1 } %3, 1
+  br i1 %4, label %trap, label %cont1
+
+cont1:
+  %5 = extractvalue { i32, i1 } %3, 0
+  %cmp = icmp slt i32 %5, %2
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+declare void @external_fn(...) local_unnamed_addr #0
+
 declare void @llvm.trap() #2
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1