Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -1152,9 +1152,18 @@
 static void replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
                                         Instruction *InsertPt,
                                         Intrinsic::ID IID) {
+  Value *Arg0 = BO->getOperand(0);
+  Value *Arg1 = BO->getOperand(1);
+
+  // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
+  if (BO->getOpcode() == Instruction::Add &&
+      IID == Intrinsic::usub_with_overflow) {
+    assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
+    Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
+  }
+
   IRBuilder<> Builder(InsertPt);
-  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, BO->getOperand(0),
-                                                BO->getOperand(1));
+  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
   Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
   BO->replaceAllUsesWith(Math);
@@ -1200,6 +1209,74 @@
   return true;
 }
 
+static bool combineToUSubWithOverflow(CmpInst *Cmp, const TargetLowering &TLI,
+                                      const DataLayout &DL, bool &ModifiedDT) {
+  // Convert (A u> B) to (A u< B) to simplify pattern matching.
+  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
+  ICmpInst::Predicate Pred = Cmp->getPredicate();
+  if (Pred == ICmpInst::ICMP_UGT) {
+    std::swap(A, B);
+    Pred = ICmpInst::ICMP_ULT;
+  }
+  // Convert special-case: (A == 0) is the same as (A u< 1).
+  if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
+    B = ConstantInt::get(B->getType(), 1);
+    Pred = ICmpInst::ICMP_ULT;
+  }
+  if (Pred != ICmpInst::ICMP_ULT)
+    return false;
+
+  // Walk the users of a variable operand of a compare looking for a subtract or
+  // add with that same operand. Also match the 2nd operand of the compare to
+  // the add/sub, but that may be a negated constant operand of an add.
+  Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
+  BinaryOperator *Sub = nullptr;
+  for (User *U : CmpVariableOperand->users()) {
+    // A - B, A u< B --> usubo(A, B)
+    if (match(U, m_Sub(m_Specific(A), m_Specific(B))))
+      Sub = cast<BinaryOperator>(U);
+
+    // A + (-C), A u< C (canonicalized form of (sub A, C))
+    const APInt *CmpC, *AddC;
+    if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
+        match(B, m_APInt(CmpC)) && *AddC == -(*CmpC))
+      Sub = cast<BinaryOperator>(U);
+
+    // FIXME: This is a hack to avoid a later pattern-matching failure for
+    // sum-of-absolute-differences:
+    // X = B - A
+    // Y = A - B
+    // Z = select (A u< B), X, Y
+    if (match(U, m_Sub(m_Specific(B), m_Specific(A))))
+      return false;
+  }
+  if (!Sub)
+    return false;
+
+  // Allow the transform as long as we have an integer type that is not
+  // obviously illegal and unsupported.
+  // TODO: Allow vector types.
+  Type *Ty = Sub->getType();
+  if (!isa<IntegerType>(Ty))
+    return false;
+  EVT CodegenVT = TLI.getValueType(DL, Ty);
+  if (!CodegenVT.isSimple() && TLI.isOperationExpand(ISD::UADDO, CodegenVT))
+    return false;
+
+  // Pattern matched and profitability checked. Check dominance to determine the
+  // insertion point for an intrinsic that replaces the subtract and compare.
+  DominatorTree DT(*Sub->getFunction());
+  bool SubDominates = DT.dominates(Sub, Cmp);
+  if (!SubDominates && !DT.dominates(Cmp, Sub))
+    return false;
+  Instruction *InPt = SubDominates ? cast<Instruction>(Sub)
+                                   : cast<Instruction>(Cmp);
+  replaceMathCmpWithIntrinsic(Sub, Cmp, InPt, Intrinsic::usub_with_overflow);
+  // Reset callers - do not crash by iterating over a dead instruction.
+  ModifiedDT = true;
+  return true;
+}
+
 /// Sink the given CmpInst into user blocks to reduce the number of virtual
 /// registers that must be created and coalesced. This is a clear win except on
 /// targets with multiple condition code registers (PowerPC), where it might
@@ -1266,14 +1343,17 @@
   return MadeChange;
 }
 
-static bool optimizeCmpExpression(CmpInst *Cmp, const TargetLowering &TLI,
-                                  const DataLayout &DL) {
+static bool optimizeCmp(CmpInst *Cmp, const TargetLowering &TLI,
+                        const DataLayout &DL, bool &ModifiedDT) {
   if (sinkCmpExpression(Cmp, TLI))
     return true;
 
   if (combineToUAddWithOverflow(Cmp, TLI, DL))
     return true;
 
+  if (combineToUSubWithOverflow(Cmp, TLI, DL, ModifiedDT))
+    return true;
+
   return false;
 }
 
@@ -6760,8 +6840,8 @@
     return false;
   }
 
-  if (CmpInst *CI = dyn_cast<CmpInst>(I))
-    if (TLI && optimizeCmpExpression(CI, *TLI, *DL))
+  if (auto *Cmp = dyn_cast<CmpInst>(I))
+    if (TLI && optimizeCmp(Cmp, *TLI, *DL, ModifiedDT))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
Index: test/CodeGen/AArch64/cgp-usubo.ll
===================================================================
--- test/CodeGen/AArch64/cgp-usubo.ll
+++ test/CodeGen/AArch64/cgp-usubo.ll
@@ -21,11 +21,9 @@
 define i1 @usubo_ugt_i32(i32 %x, i32 %y, i32* %p) nounwind {
 ; CHECK-LABEL: usubo_ugt_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp w1, w0
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    sub w9, w0, w1
-; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    str w9, [x2]
+; CHECK-NEXT:    subs w8, w0, w1
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    str w8, [x2]
 ; CHECK-NEXT:    ret
   %ov = icmp ugt i32 %y, %x
   %s = sub i32 %x, %y
@@ -38,12 +36,11 @@
 define i1 @usubo_ugt_constant_op0_i8(i8 %x, i8* %p) nounwind {
 ; CHECK-LABEL: usubo_ugt_constant_op0_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    mov w9, #42
-; CHECK-NEXT:    cmp w8, #42 // =42
-; CHECK-NEXT:    sub w9, w9, w0
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    strb w9, [x1]
+; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    sub w8, w8, w0, uxtb
+; CHECK-NEXT:    tst w8, #0xffffff00
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    strb w8, [x1]
 ; CHECK-NEXT:    ret
   %s = sub i8 42, %x
   %ov = icmp ugt i8 %x, 42
@@ -56,12 +53,11 @@
 define i1 @usubo_ult_constant_op0_i16(i16 %x, i16* %p) nounwind {
 ; CHECK-LABEL: usubo_ult_constant_op0_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #43
-; CHECK-NEXT:    cmp w8, #43 // =43
-; CHECK-NEXT:    sub w9, w9, w0
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    strh w9, [x1]
+; CHECK-NEXT:    mov w8, #43
+; CHECK-NEXT:    sub w8, w8, w0, uxth
+; CHECK-NEXT:    tst w8, #0xffff0000
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    strh w8, [x1]
 ; CHECK-NEXT:    ret
   %s = sub i16 43, %x
   %ov = icmp ult i16 43, %x
@@ -75,10 +71,10 @@
 ; CHECK-LABEL: usubo_ult_constant_op1_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    cmp w8, #44 // =44
-; CHECK-NEXT:    sub w9, w0, #44 // =44
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    strh w9, [x1]
+; CHECK-NEXT:    sub w8, w8, #44 // =44
+; CHECK-NEXT:    tst w8, #0xffff0000
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    strh w8, [x1]
 ; CHECK-NEXT:    ret
   %s = add i16 %x, -44
   %ov = icmp ult i16 %x, 44
@@ -90,11 +86,10 @@
 ; CHECK-LABEL: usubo_ugt_constant_op1_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    cmp w8, #45 // =45
-; CHECK-NEXT:    cset w8, lo
-; CHECK-NEXT:    sub w9, w0, #45 // =45
-; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    strb w9, [x1]
+; CHECK-NEXT:    sub w8, w8, #45 // =45
+; CHECK-NEXT:    tst w8, #0xffffff00
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    strb w8, [x1]
 ; CHECK-NEXT:    ret
   %ov = icmp ugt i8 45, %x
   %s = add i8 %x, -45
@@ -107,9 +102,8 @@
 define i1 @usubo_eq_constant1_op1_i32(i32 %x, i32* %p) nounwind {
 ; CHECK-LABEL: usubo_eq_constant1_op1_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp w0, #0 // =0
-; CHECK-NEXT:    sub w8, w0, #1 // =1
-; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    subs w8, w0, #1 // =1
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
   %s = add i32 %x, -1
Index: test/CodeGen/AMDGPU/sad.ll
===================================================================
--- test/CodeGen/AMDGPU/sad.ll
+++ test/CodeGen/AMDGPU/sad.ll
@@ -291,9 +291,11 @@
   ret void
 }
 
+; This is matched to usubo in CGP.
+
 ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2:
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
Index: test/CodeGen/PowerPC/bdzlr.ll
===================================================================
--- test/CodeGen/PowerPC/bdzlr.ll
+++ test/CodeGen/PowerPC/bdzlr.ll
@@ -54,13 +54,13 @@
 ; CHECK: @lua_xmove
 ; CHECK: bnelr
 ; CHECK: bnelr
-; CHECK: bdzlr
+; CHECK: bnelr
 ; CHECK-NOT: blr
 
 ; CHECK-CRB: @lua_xmove
 ; CHECK-CRB: bclr 12,
 ; CHECK-CRB: bclr 12,
-; CHECK-CRB: bdzlr
+; CHECK-CRB: bgtlr 0
 ; CHECK-CRB-NOT: blr
 }
 
Index: test/CodeGen/PowerPC/expand-contiguous-isel.ll
===================================================================
--- test/CodeGen/PowerPC/expand-contiguous-isel.ll
+++ test/CodeGen/PowerPC/expand-contiguous-isel.ll
@@ -137,7 +137,8 @@
 ; CHECK: bc 12, eq, [[TRUE:.LBB[0-9]+]]
 ; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]]
 ; CHECK-NEXT: [[TRUE]]
-; CHECK-NEXT: addi {{r[0-9]+}}, {{r[0-9]+}}, 0
+; There may be an extra asm comment line here, so can't use CHECK-NEXT.
+; CHECK: addi {{r[0-9]+}}, {{r[0-9]+}}, 0
 ; CHECK-NEXT: [[SUCCESSOR]]
 }
 
Index: test/CodeGen/X86/cgp-usubo.ll
===================================================================
--- test/CodeGen/X86/cgp-usubo.ll
+++ test/CodeGen/X86/cgp-usubo.ll
@@ -7,8 +7,8 @@
 ; CHECK-LABEL: usubo_ult_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    subq %rsi, %rdi
-; CHECK-NEXT:    movq %rdi, (%rdx)
 ; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    movq %rdi, (%rdx)
 ; CHECK-NEXT:    retq
   %s = sub i64 %x, %y
   store i64 %s, i64* %p
@@ -21,9 +21,8 @@
 define i1 @usubo_ugt_i32(i32 %x, i32 %y, i32* %p) nounwind {
 ; CHECK-LABEL: usubo_ugt_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl %edi, %esi
-; CHECK-NEXT:    seta %al
 ; CHECK-NEXT:    subl %esi, %edi
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    movl %edi, (%rdx)
 ; CHECK-NEXT:    retq
   %ov = icmp ugt i32 %y, %x
@@ -39,8 +38,7 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movb $42, %cl
 ; CHECK-NEXT:    subb %dil, %cl
-; CHECK-NEXT:    cmpb $42, %dil
-; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    movb %cl, (%rsi)
 ; CHECK-NEXT:    retq
   %s = sub i8 42, %x
@@ -54,10 +52,9 @@
 define i1 @usubo_ult_constant_op0_i16(i16 %x, i16* %p) nounwind {
 ; CHECK-LABEL: usubo_ult_constant_op0_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $43, %ecx
-; CHECK-NEXT:    subl %edi, %ecx
-; CHECK-NEXT:    cmpw $43, %di
-; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    movw $43, %cx
+; CHECK-NEXT:    subw %di, %cx
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    movw %cx, (%rsi)
 ; CHECK-NEXT:    retq
   %s = sub i16 43, %x
@@ -71,11 +68,9 @@
 define i1 @usubo_ult_constant_op1_i16(i16 %x, i16* %p) nounwind {
 ; CHECK-LABEL: usubo_ult_constant_op1_i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    addl $-44, %ecx
-; CHECK-NEXT:    cmpw $44, %di
+; CHECK-NEXT:    subw $44, %di
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movw %cx, (%rsi)
+; CHECK-NEXT:    movw %di, (%rsi)
 ; CHECK-NEXT:    retq
   %s = add i16 %x, -44
   %ov = icmp ult i16 %x, 44
@@ -86,9 +81,8 @@
 define i1 @usubo_ugt_constant_op1_i8(i8 %x, i8* %p) nounwind {
 ; CHECK-LABEL: usubo_ugt_constant_op1_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpb $45, %dil
+; CHECK-NEXT:    subb $45, %dil
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    addb $-45, %dil
 ; CHECK-NEXT:    movb %dil, (%rsi)
 ; CHECK-NEXT:    retq
   %ov = icmp ugt i8 45, %x
@@ -102,11 +96,9 @@
 define i1 @usubo_eq_constant1_op1_i32(i32 %x, i32* %p) nounwind {
 ; CHECK-LABEL: usubo_eq_constant1_op1_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal -1(%rdi), %ecx
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    movl %ecx, (%rsi)
+; CHECK-NEXT:    subl $1, %edi
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    movl %edi, (%rsi)
 ; CHECK-NEXT:    retq
   %s = add i32 %x, -1
   %ov = icmp eq i32 %x, 0
@@ -124,17 +116,14 @@
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    je .LBB7_2
 ; CHECK-NEXT:  # %bb.1: # %t
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    subq %rsi, %rax
-; CHECK-NEXT:    movq %rax, (%rdx)
-; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    je .LBB7_2
-; CHECK-NEXT:  # %bb.3: # %end
-; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    subq %rsi, %rdi
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB7_3
 ; CHECK-NEXT:  .LBB7_2: # %f
 ; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB7_3: # %end
 ; CHECK-NEXT:    retq
 entry:
   br i1 %cond, label %t, label %f
Index: test/CodeGen/X86/lsr-loop-exit-cond.ll
===================================================================
--- test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -16,11 +16,11 @@
 ; GENERIC-NEXT:    movl (%rdx), %eax
 ; GENERIC-NEXT:    movl 4(%rdx), %ebx
 ; GENERIC-NEXT:    decl %ecx
-; GENERIC-NEXT:    leaq 20(%rdx), %r14
+; GENERIC-NEXT:    leaq 20(%rdx), %r11
 ; GENERIC-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; GENERIC-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; GENERIC-NEXT:    movq _Te3@{{.*}}(%rip), %r10
-; GENERIC-NEXT:    movq %rcx, %r11
+; GENERIC-NEXT:    movq %rcx, %r14
 ; GENERIC-NEXT:    jmp LBB0_1
 ; GENERIC-NEXT:    .p2align 4, 0x90
 ; GENERIC-NEXT:  LBB0_2: ## %bb1
@@ -29,14 +29,13 @@
 ; GENERIC-NEXT:    shrl $16, %ebx
 ; GENERIC-NEXT:    movzbl %bl, %ebx
 ; GENERIC-NEXT:    xorl (%r8,%rbx,4), %eax
-; GENERIC-NEXT:    xorl -4(%r14), %eax
+; GENERIC-NEXT:    xorl -4(%r11), %eax
 ; GENERIC-NEXT:    shrl $24, %edi
 ; GENERIC-NEXT:    movzbl %bpl, %ebx
 ; GENERIC-NEXT:    movl (%r10,%rbx,4), %ebx
 ; GENERIC-NEXT:    xorl (%r9,%rdi,4), %ebx
-; GENERIC-NEXT:    xorl (%r14), %ebx
-; GENERIC-NEXT:    decq %r11
-; GENERIC-NEXT:    addq $16, %r14
+; GENERIC-NEXT:    xorl (%r11), %ebx
+; GENERIC-NEXT:    addq $16, %r11
 ; GENERIC-NEXT:  LBB0_1: ## %bb
 ; GENERIC-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; GENERIC-NEXT:    movzbl %al, %edi
@@ -47,16 +46,16 @@
 ; GENERIC-NEXT:    movzbl %bpl, %ebp
 ; GENERIC-NEXT:    movl (%r8,%rbp,4), %ebp
 ; GENERIC-NEXT:    xorl (%r9,%rax,4), %ebp
-; GENERIC-NEXT:    xorl -12(%r14), %ebp
+; GENERIC-NEXT:    xorl -12(%r11), %ebp
 ; GENERIC-NEXT:    shrl $24, %ebx
 ; GENERIC-NEXT:    movl (%r10,%rdi,4), %edi
 ; GENERIC-NEXT:    xorl (%r9,%rbx,4), %edi
-; GENERIC-NEXT:    xorl -8(%r14), %edi
+; GENERIC-NEXT:    xorl -8(%r11), %edi
 ; GENERIC-NEXT:    movl %ebp, %eax
 ; GENERIC-NEXT:    shrl $24, %eax
 ; GENERIC-NEXT:    movl (%r9,%rax,4), %eax
-; GENERIC-NEXT:    testq %r11, %r11
-; GENERIC-NEXT:    jne LBB0_2
+; GENERIC-NEXT:    subq $1, %r14
+; GENERIC-NEXT:    jae LBB0_2
 ; GENERIC-NEXT:  ## %bb.3: ## %bb2
 ; GENERIC-NEXT:    shlq $4, %rcx
 ; GENERIC-NEXT:    andl $-16777216, %eax ## imm = 0xFF000000
@@ -99,27 +98,26 @@
 ; ATOM-NEXT:    ## kill: def $ecx killed $ecx def $rcx
 ; ATOM-NEXT:    movl (%rdx), %r15d
 ; ATOM-NEXT:    movl 4(%rdx), %eax
-; ATOM-NEXT:    leaq 20(%rdx), %r14
+; ATOM-NEXT:    leaq 20(%rdx), %r11
 ; ATOM-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; ATOM-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; ATOM-NEXT:    movq _Te3@{{.*}}(%rip), %r10
 ; ATOM-NEXT:    decl %ecx
-; ATOM-NEXT:    movq %rcx, %r11
+; ATOM-NEXT:    movq %rcx, %r14
 ; ATOM-NEXT:    jmp LBB0_1
 ; ATOM-NEXT:    .p2align 4, 0x90
 ; ATOM-NEXT:  LBB0_2: ## %bb1
 ; ATOM-NEXT:    ## in Loop: Header=BB0_1 Depth=1
 ; ATOM-NEXT:    shrl $16, %eax
 ; ATOM-NEXT:    shrl $24, %edi
-; ATOM-NEXT:    decq %r11
-; ATOM-NEXT:    movzbl %al, %ebp
+; ATOM-NEXT:    movzbl %al, %eax
+; ATOM-NEXT:    xorl (%r8,%rax,4), %r15d
 ; ATOM-NEXT:    movzbl %bl, %eax
 ; ATOM-NEXT:    movl (%r10,%rax,4), %eax
-; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
+; ATOM-NEXT:    xorl -4(%r11), %r15d
 ; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
-; ATOM-NEXT:    xorl -4(%r14), %r15d
-; ATOM-NEXT:    xorl (%r14), %eax
-; ATOM-NEXT:    addq $16, %r14
+; ATOM-NEXT:    xorl (%r11), %eax
+; ATOM-NEXT:    addq $16, %r11
 ; ATOM-NEXT:  LBB0_1: ## %bb
 ; ATOM-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; ATOM-NEXT:    movl %eax, %edi
@@ -132,15 +130,15 @@
 ; ATOM-NEXT:    movzbl %r15b, %edi
 ; ATOM-NEXT:    xorl (%r9,%rbp,4), %ebx
 ; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
-; ATOM-NEXT:    xorl -12(%r14), %ebx
+; ATOM-NEXT:    xorl -12(%r11), %ebx
 ; ATOM-NEXT:    xorl (%r9,%rax,4), %edi
 ; ATOM-NEXT:    movl %ebx, %eax
-; ATOM-NEXT:    xorl -8(%r14), %edi
+; ATOM-NEXT:    xorl -8(%r11), %edi
 ; ATOM-NEXT:    shrl $24, %eax
 ; ATOM-NEXT:    movl (%r9,%rax,4), %r15d
-; ATOM-NEXT:    testq %r11, %r11
+; ATOM-NEXT:    subq $1, %r14
 ; ATOM-NEXT:    movl %edi, %eax
-; ATOM-NEXT:    jne LBB0_2
+; ATOM-NEXT:    jae LBB0_2
 ; ATOM-NEXT:  ## %bb.3: ## %bb2
 ; ATOM-NEXT:    shrl $16, %eax
 ; ATOM-NEXT:    shrl $8, %edi
Index: test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
===================================================================
--- test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
+++ test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -175,10 +175,11 @@
 
 define i1 @usubo_ult_i64(i64 %x, i64 %y, i64* %p) {
 ; CHECK-LABEL: @usubo_ult_i64(
-; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %s = sub i64 %x, %y
   store i64 %s, i64* %p
@@ -190,10 +191,11 @@
 
 define i1 @usubo_ugt_i32(i32 %x, i32 %y, i32* %p) {
 ; CHECK-LABEL: @usubo_ugt_i32(
-; CHECK-NEXT:    [[OV:%.*]] = icmp ugt i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[S:%.*]] = sub i32 [[X]], [[Y]]
-; CHECK-NEXT:    store i32 [[S]], i32* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i32 [[MATH]], i32* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %ov = icmp ugt i32 %y, %x
   %s = sub i32 %x, %y
@@ -205,10 +207,11 @@
 
 define i1 @usubo_ugt_constant_op0_i8(i8 %x, i8* %p) {
 ; CHECK-LABEL: @usubo_ugt_constant_op0_i8(
-; CHECK-NEXT:    [[S:%.*]] = sub i8 42, [[X:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ugt i8 [[X]], 42
-; CHECK-NEXT:    store i8 [[S]], i8* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 42, i8 [[X:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i8, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i8 [[MATH]], i8* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %s = sub i8 42, %x
   %ov = icmp ugt i8 %x, 42
@@ -220,10 +223,11 @@
 
 define i1 @usubo_ult_constant_op0_i16(i16 %x, i16* %p) {
 ; CHECK-LABEL: @usubo_ult_constant_op0_i16(
-; CHECK-NEXT:    [[S:%.*]] = sub i16 43, [[X:%.*]]
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i16 43, [[X]]
-; CHECK-NEXT:    store i16 [[S]], i16* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 43, i16 [[X:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i16, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i16 [[MATH]], i16* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %s = sub i16 43, %x
   %ov = icmp ult i16 43, %x
@@ -235,10 +239,11 @@
 
 define i1 @usubo_ult_constant_op1_i16(i16 %x, i16* %p) {
 ; CHECK-LABEL: @usubo_ult_constant_op1_i16(
-; CHECK-NEXT:    [[S:%.*]] = add i16 [[X:%.*]], -44
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i16 [[X]], 44
-; CHECK-NEXT:    store i16 [[S]], i16* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[X:%.*]], i16 44)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i16, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i16 [[MATH]], i16* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %s = add i16 %x, -44
   %ov = icmp ult i16 %x, 44
@@ -248,10 +253,11 @@
 
 define i1 @usubo_ugt_constant_op1_i8(i8 %x, i8* %p) {
 ; CHECK-LABEL: @usubo_ugt_constant_op1_i8(
-; CHECK-NEXT:    [[OV:%.*]] = icmp ugt i8 45, [[X:%.*]]
-; CHECK-NEXT:    [[S:%.*]] = add i8 [[X]], -45
-; CHECK-NEXT:    store i8 [[S]], i8* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 45)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i8, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i8 [[MATH]], i8* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %ov = icmp ugt i8 45, %x
   %s = add i8 %x, -45
@@ -263,10 +269,11 @@
 
 define i1 @usubo_eq_constant1_op1_i32(i32 %x, i32* %p) {
 ; CHECK-LABEL: @usubo_eq_constant1_op1_i32(
-; CHECK-NEXT:    [[S:%.*]] = add i32 [[X:%.*]], -1
-; CHECK-NEXT:    [[OV:%.*]] = icmp eq i32 [[X]], 0
-; CHECK-NEXT:    store i32 [[S]], i32* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[X:%.*]], i32 1)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
+; CHECK-NEXT:    store i32 [[MATH]], i32* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
   %s = add i32 %x, -1
   %ov = icmp eq i32 %x, 0
@@ -283,14 +290,15 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[END:%.*]], label [[F]]
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
 entry:
   br i1 %cond, label %t, label %f
@@ -319,10 +327,11 @@
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[X]], [[Y]]
-; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X]], [[Y]]
-; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
-; CHECK-NEXT:    ret i1 [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X]], i64 [[Y]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
 entry:
   br i1 %cond, label %t, label %f