Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2705,25 +2705,59 @@
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
-  // A divide for UMULO should be faster than a function call.
   if (N->getOpcode() == ISD::UMULO) {
+    // This section expands the operation into the following sequence of
+    // instructions. `iN` is the type of the original operation, `iNh` is a
+    // type with half that bit width, and `Nh` is the bit width of `iNh`.
+    //
+    // %0 = %LHS.HI != 0 && %RHS.HI != 0
+    // %1 = { iNh, i1 } @umul.with.overflow.iNh(iNh %LHS.HI, iNh %RHS.LO)
+    // %2 = { iNh, i1 } @umul.with.overflow.iNh(iNh %RHS.HI, iNh %LHS.LO)
+    // %3 = mul nuw iN (%LHS.LO as iN), (%RHS.LO as iN)
+    // %4 = add iN (%1.0 as iN) << Nh, (%2.0 as iN) << Nh
+    // %5 = { iN, i1 } @uadd.with.overflow.iN( %4, %3 )
+    //
+    // %res = { %5.0, %0 || %1.1 || %2.1 || %5.1 }
     SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
-
-    SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS);
-    SplitInteger(MUL, Lo, Hi);
-
-    // A divide for UMULO will be faster than a function call. Select to
-    // make sure we aren't using 0.
-    SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT),
-                                  RHS, DAG.getConstant(0, dl, VT), ISD::SETEQ);
-    SDValue NotZero = DAG.getSelect(dl, VT, isZero,
-                                    DAG.getConstant(1, dl, VT), RHS);
-    SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
-    SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
-                                    ISD::SETNE);
-    Overflow = DAG.getSelect(dl, N->getValueType(1), isZero,
-                             DAG.getConstant(0, dl, N->getValueType(1)),
-                             Overflow);
+    SDValue LHSHigh, LHSLow, RHSHigh, RHSLow;
+    SplitInteger(LHS, LHSLow, LHSHigh);
+    SplitInteger(RHS, RHSLow, RHSHigh);
+    EVT HalfVT = LHSLow.getValueType()
+      , BitVT = N->getValueType(1);
+    SDVTList VTHalfMulO = DAG.getVTList(HalfVT, BitVT);
+    SDVTList VTFullAddO = DAG.getVTList(VT, BitVT);
+
+    SDValue HalfZero = DAG.getConstant(0, dl, HalfVT);
+    SDValue Overflow = DAG.getNode(ISD::AND, dl, BitVT,
+      DAG.getSetCC(dl, BitVT, LHSHigh, HalfZero, ISD::SETNE),
+      DAG.getSetCC(dl, BitVT, RHSHigh, HalfZero, ISD::SETNE));
+
+    SDValue One = DAG.getNode(ISD::UMULO, dl, VTHalfMulO, LHSHigh, RHSLow);
+    Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, One.getValue(1));
+
+    SDValue Two = DAG.getNode(ISD::UMULO, dl, VTHalfMulO, RHSHigh, LHSLow);
+    Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, Two.getValue(1));
+
+    // Cannot use `UMUL_LOHI` directly, because some 32-bit targets (ARM) do not
+    // know how to expand `i64,i64 = umul_lohi a, b` and abort (why isn't this
+    // operation recursively legalized?).
+    //
+    // Many backends understand this pattern and will convert it into
+    // `UMUL_LOHI` themselves, if applicable.
+    SDValue Three = DAG.getNode(ISD::MUL, dl, VT,
+      DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LHSLow),
+      DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RHSLow));
+
+    MVT ShiftAmountTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+    auto ShiftAmount = DAG.getConstant(One.getValueSizeInBits(), dl, ShiftAmountTy);
+    SDValue OneInHigh = DAG.getNode(ISD::SHL, dl, VT,
+      DAG.getNode(ISD::ANY_EXTEND, dl, VT, One.getValue(0)), ShiftAmount);
+    SDValue TwoInHigh = DAG.getNode(ISD::SHL, dl, VT,
+      DAG.getNode(ISD::ANY_EXTEND, dl, VT, Two.getValue(0)), ShiftAmount);
+    SDValue Four = DAG.getNode(ISD::ADD, dl, VT, OneInHigh, TwoInHigh);
+    SDValue Five = DAG.getNode(ISD::UADDO, dl, VTFullAddO, Three, Four);
+    Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, Five.getValue(1));
+    SplitInteger(Five, Lo, Hi);
     ReplaceValueWith(SDValue(N, 1), Overflow);
     return;
   }
Index: test/CodeGen/X86/muloti.ll
===================================================================
--- test/CodeGen/X86/muloti.ll
+++ test/CodeGen/X86/muloti.ll
@@ -57,8 +57,7 @@
   %tmp = load i128, i128* %a.addr, align 16
   %tmp2 = load i128, i128* %b.addr, align 16
   %6 = call %1 @llvm.umul.with.overflow.i128(i128 %tmp, i128 %tmp2)
-; CHECK: cmov
-; CHECK: divti3
+; CHECK-NOT: divti3
   %7 = extractvalue %1 %6, 0
   %8 = extractvalue %1 %6, 1
   br i1 %8, label %overflow, label %nooverflow
Index: test/CodeGen/X86/select.ll
===================================================================
--- test/CodeGen/X86/select.ll
+++ test/CodeGen/X86/select.ll
@@ -53,6 +53,7 @@
 ; GENERIC-NEXT:    popq %rcx
 ; GENERIC-NEXT:    retq
 ; GENERIC-NEXT:  LBB1_1: ## %bb90
+; GENERIC-NEXT:    ud2
 ;
 ; ATOM-LABEL: test2:
 ; ATOM:       ## %bb.0: ## %entry
@@ -70,6 +71,7 @@
 ; ATOM-NEXT:    popq %rcx
 ; ATOM-NEXT:    retq
 ; ATOM-NEXT:  LBB1_1: ## %bb90
+; ATOM-NEXT:    ud2
 ;
 ; MCU-LABEL: test2:
 ; MCU:       # %bb.0: # %entry
@@ -636,71 +638,6 @@
   ret i64 %cond
 }
 
-
-declare noalias i8* @_Znam(i64) noredzone
-
-define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
-; GENERIC-LABEL: test12:
-; GENERIC:       ## %bb.0: ## %entry
-; GENERIC-NEXT:    movl $4, %ecx
-; GENERIC-NEXT:    movq %rdi, %rax
-; GENERIC-NEXT:    mulq %rcx
-; GENERIC-NEXT:    movq $-1, %rdi
-; GENERIC-NEXT:    cmovnoq %rax, %rdi
-; GENERIC-NEXT:    jmp __Znam ## TAILCALL
-;
-; ATOM-LABEL: test12:
-; ATOM:       ## %bb.0: ## %entry
-; ATOM-NEXT:    movq %rdi, %rax
-; ATOM-NEXT:    movl $4, %ecx
-; ATOM-NEXT:    movq $-1, %rdi
-; ATOM-NEXT:    mulq %rcx
-; ATOM-NEXT:    cmovnoq %rax, %rdi
-; ATOM-NEXT:    jmp __Znam ## TAILCALL
-;
-; MCU-LABEL: test12:
-; MCU:       # %bb.0: # %entry
-; MCU-NEXT:    pushl %ebp
-; MCU-NEXT:    pushl %ebx
-; MCU-NEXT:    pushl %edi
-; MCU-NEXT:    pushl %esi
-; MCU-NEXT:    movl %edx, %ebx
-; MCU-NEXT:    movl %eax, %ebp
-; MCU-NEXT:    movl $4, %ecx
-; MCU-NEXT:    mull %ecx
-; MCU-NEXT:    movl %eax, %esi
-; MCU-NEXT:    leal (%edx,%ebx,4), %edi
-; MCU-NEXT:    movl %edi, %edx
-; MCU-NEXT:    pushl $0
-; MCU-NEXT:    pushl $4
-; MCU-NEXT:    calll __udivdi3
-; MCU-NEXT:    addl $8, %esp
-; MCU-NEXT:    xorl %ebx, %edx
-; MCU-NEXT:    xorl %ebp, %eax
-; MCU-NEXT:    orl %edx, %eax
-; MCU-NEXT:    movl $-1, %eax
-; MCU-NEXT:    movl $-1, %edx
-; MCU-NEXT:    jne .LBB14_2
-; MCU-NEXT:  # %bb.1: # %entry
-; MCU-NEXT:    movl %esi, %eax
-; MCU-NEXT:    movl %edi, %edx
-; MCU-NEXT:  .LBB14_2: # %entry
-; MCU-NEXT:    popl %esi
-; MCU-NEXT:    popl %edi
-; MCU-NEXT:    popl %ebx
-; MCU-NEXT:    popl %ebp
-; MCU-NEXT:    jmp _Znam # TAILCALL
-entry:
-  %A = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %count, i64 4)
-  %B = extractvalue { i64, i1 } %A, 1
-  %C = extractvalue { i64, i1 } %A, 0
-  %D = select i1 %B, i64 -1, i64 %C
-  %call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
-  ret i8* %call
-}
-
-declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
-
 define i32 @test13(i32 %a, i32 %b) nounwind {
 ; GENERIC-LABEL: test13:
 ; GENERIC:       ## %bb.0:
@@ -862,10 +799,10 @@
 ; MCU-LABEL: test18:
 ; MCU:       # %bb.0:
 ; MCU-NEXT:    cmpl $15, %eax
-; MCU-NEXT:    jl .LBB20_2
+; MCU-NEXT:    jl .LBB19_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    movl %ecx, %edx
-; MCU-NEXT:  .LBB20_2:
+; MCU-NEXT:  .LBB19_2:
 ; MCU-NEXT:    movl %edx, %eax
 ; MCU-NEXT:    retl
   %cmp = icmp slt i32 %x, 15
@@ -902,10 +839,10 @@
 ; GENERIC-NEXT:    cmovlel %edi, %eax
 ; GENERIC-NEXT:    cmpl $-128, %eax
 ; GENERIC-NEXT:    movb $-128, %cl
-; GENERIC-NEXT:    jl LBB22_2
+; GENERIC-NEXT:    jl LBB21_2
 ; GENERIC-NEXT:  ## %bb.1:
 ; GENERIC-NEXT:    movl %eax, %ecx
-; GENERIC-NEXT:  LBB22_2:
+; GENERIC-NEXT:  LBB21_2:
 ; GENERIC-NEXT:    movb %cl, (%rsi)
 ; GENERIC-NEXT:    retq
 ;
@@ -916,10 +853,10 @@
 ; ATOM-NEXT:    movb $-128, %cl
 ; ATOM-NEXT:    cmovlel %edi, %eax
 ; ATOM-NEXT:    cmpl $-128, %eax
-; ATOM-NEXT:    jl LBB22_2
+; ATOM-NEXT:    jl LBB21_2
 ; ATOM-NEXT:  ## %bb.1:
 ; ATOM-NEXT:    movl %eax, %ecx
-; ATOM-NEXT:  LBB22_2:
+; ATOM-NEXT:  LBB21_2:
 ; ATOM-NEXT:    movb %cl, (%rsi)
 ; ATOM-NEXT:    retq
 ;
@@ -927,16 +864,16 @@
 ; MCU:       # %bb.0:
 ; MCU-NEXT:    cmpl $127, %eax
 ; MCU-NEXT:    movl $127, %ecx
-; MCU-NEXT:    jg .LBB22_2
+; MCU-NEXT:    jg .LBB21_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    movl %eax, %ecx
-; MCU-NEXT:  .LBB22_2:
+; MCU-NEXT:  .LBB21_2:
 ; MCU-NEXT:    cmpl $-128, %ecx
 ; MCU-NEXT:    movb $-128, %al
-; MCU-NEXT:    jl .LBB22_4
+; MCU-NEXT:    jl .LBB21_4
 ; MCU-NEXT:  # %bb.3:
 ; MCU-NEXT:    movl %ecx, %eax
-; MCU-NEXT:  .LBB22_4:
+; MCU-NEXT:  .LBB21_4:
 ; MCU-NEXT:    movb %al, (%edx)
 ; MCU-NEXT:    retl
   %cmp = icmp sgt i32 %src, 127
@@ -976,16 +913,16 @@
 ; MCU:       # %bb.0:
 ; MCU-NEXT:    cmpl $32767, %eax # imm = 0x7FFF
 ; MCU-NEXT:    movl $32767, %ecx # imm = 0x7FFF
-; MCU-NEXT:    jg .LBB23_2
+; MCU-NEXT:    jg .LBB22_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    movl %eax, %ecx
-; MCU-NEXT:  .LBB23_2:
+; MCU-NEXT:  .LBB22_2:
 ; MCU-NEXT:    cmpl $-32768, %ecx # imm = 0x8000
 ; MCU-NEXT:    movl $32768, %eax # imm = 0x8000
-; MCU-NEXT:    jl .LBB23_4
+; MCU-NEXT:    jl .LBB22_4
 ; MCU-NEXT:  # %bb.3:
 ; MCU-NEXT:    movl %ecx, %eax
-; MCU-NEXT:  .LBB23_4:
+; MCU-NEXT:  .LBB22_4:
 ; MCU-NEXT:    movw %ax, (%edx)
 ; MCU-NEXT:    retl
   %cmp = icmp sgt i32 %src, 32767
@@ -1009,19 +946,19 @@
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    movb $1, %cl
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB24_1: ## %CF
+; CHECK-NEXT:  LBB23_1: ## %CF
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    testb %cl, %cl
-; CHECK-NEXT:    jne LBB24_1
+; CHECK-NEXT:    jne LBB23_1
 ; CHECK-NEXT:  ## %bb.2: ## %CF250
-; CHECK-NEXT:    ## in Loop: Header=BB24_1 Depth=1
-; CHECK-NEXT:    jne LBB24_1
+; CHECK-NEXT:    ## in Loop: Header=BB23_1 Depth=1
+; CHECK-NEXT:    jne LBB23_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB24_3: ## %CF242
+; CHECK-NEXT:  LBB23_3: ## %CF242
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpl %eax, %eax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-NEXT:    jp LBB24_3
+; CHECK-NEXT:    jp LBB23_3
 ; CHECK-NEXT:  ## %bb.4: ## %CF244
 ; CHECK-NEXT:    retq
 ;
@@ -1030,24 +967,24 @@
 ; MCU-NEXT:    movl $-1, %ecx
 ; MCU-NEXT:    movb $1, %al
 ; MCU-NEXT:    .p2align 4, 0x90
-; MCU-NEXT:  .LBB24_1: # %CF
+; MCU-NEXT:  .LBB23_1: # %CF
 ; MCU-NEXT:    # =>This Inner Loop Header: Depth=1
 ; MCU-NEXT:    testb %al, %al
-; MCU-NEXT:    jne .LBB24_1
+; MCU-NEXT:    jne .LBB23_1
 ; MCU-NEXT:  # %bb.2: # %CF250
-; MCU-NEXT:    # in Loop: Header=BB24_1 Depth=1
-; MCU-NEXT:    jne .LBB24_1
+; MCU-NEXT:    # in Loop: Header=BB23_1 Depth=1
+; MCU-NEXT:    jne .LBB23_1
 ; MCU-NEXT:  # %bb.3: # %CF242.preheader
 ; MCU-NEXT:    fldz
 ; MCU-NEXT:    .p2align 4, 0x90
-; MCU-NEXT:  .LBB24_4: # %CF242
+; MCU-NEXT:  .LBB23_4: # %CF242
 ; MCU-NEXT:    # =>This Inner Loop Header: Depth=1
 ; MCU-NEXT:    cmpl %eax, %ecx
 ; MCU-NEXT:    fucom %st(0)
 ; MCU-NEXT:    fnstsw %ax
 ; MCU-NEXT:    # kill: def $ah killed $ah killed $ax
 ; MCU-NEXT:    sahf
-; MCU-NEXT:    jp .LBB24_4
+; MCU-NEXT:    jp .LBB23_4
 ; MCU-NEXT:  # %bb.5: # %CF244
 ; MCU-NEXT:    fstp %st(0)
 ; MCU-NEXT:    retl
@@ -1116,10 +1053,10 @@
 ; MCU-LABEL: select_xor_1b:
 ; MCU:       # %bb.0: # %entry
 ; MCU-NEXT:    testb $1, %dl
-; MCU-NEXT:    je .LBB26_2
+; MCU-NEXT:    je .LBB25_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    xorl $43, %eax
-; MCU-NEXT:  .LBB26_2: # %entry
+; MCU-NEXT:  .LBB25_2: # %entry
 ; MCU-NEXT:    # kill: def $ax killed $ax killed $eax
 ; MCU-NEXT:    retl
 entry:
@@ -1168,10 +1105,10 @@
 ; MCU-LABEL: select_xor_2b:
 ; MCU:       # %bb.0: # %entry
 ; MCU-NEXT:    testb $1, %cl
-; MCU-NEXT:    je .LBB28_2
+; MCU-NEXT:    je .LBB27_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    xorl %edx, %eax
-; MCU-NEXT:  .LBB28_2: # %entry
+; MCU-NEXT:  .LBB27_2: # %entry
 ; MCU-NEXT:    retl
 entry:
  %and = and i8 %cond, 1
@@ -1219,10 +1156,10 @@
 ; MCU-LABEL: select_or_b:
 ; MCU:       # %bb.0: # %entry
 ; MCU-NEXT:    testb $1, %cl
-; MCU-NEXT:    je .LBB30_2
+; MCU-NEXT:    je .LBB29_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    orl %edx, %eax
-; MCU-NEXT:  .LBB30_2: # %entry
+; MCU-NEXT:  .LBB29_2: # %entry
 ; MCU-NEXT:    retl
 entry:
  %and = and i8 %cond, 1
@@ -1270,10 +1207,10 @@
 ; MCU-LABEL: select_or_1b:
 ; MCU:       # %bb.0: # %entry
 ; MCU-NEXT:    testb $1, %cl
-; MCU-NEXT:    je .LBB32_2
+; MCU-NEXT:    je .LBB31_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    orl %edx, %eax
-; MCU-NEXT:  .LBB32_2: # %entry
+; MCU-NEXT:  .LBB31_2: # %entry
 ; MCU-NEXT:    retl
 entry:
  %and = and i32 %cond, 1