diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,15 @@
 // Remainder = Sum % Constant
 // This is based on "Remainder by Summing Digits" from Hacker's Delight.
 //
-// For division, we can compute the remainder, subtract it from the dividend,
-// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
+// If Constant is even, we can shift right the dividend and the divisor by the
+// number of trailing zeros in Constant before computing the remainder. Then
+// fixup the remainder by shifting it left by the number of trailing zeros and
+// adding the bits that were shifted out of the dividend.
+//
+// For division, we can compute the remainder using the algorithm described
+// above, subtract it from the dividend to get an exact multiple of Constant.
+// Then multiply that exact multiple by the multiplicative inverse modulo
+// (1 << (BitWidth / 2)).
 bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7195,7 @@
   if (!CN)
     return false;
 
-  const APInt &Divisor = CN->getAPIntValue();
+  APInt Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,10 +7216,17 @@
   if (DAG.shouldOptForSize())
     return false;
 
-  // Early out for 0, 1 or even divisors.
-  if (Divisor.ule(1) || Divisor[0] == 0)
+  // Early out for 0 or 1 divisors.
+  if (Divisor.ule(1))
     return false;
 
+  // If the divisor is even, shift it until it becomes odd.
+  unsigned TrailingZeros = 0;
+  if (!Divisor[0]) {
+    TrailingZeros = Divisor.countTrailingZeros();
+    Divisor.lshrInPlace(TrailingZeros);
+  }
+
   SDLoc dl(N);
 
   SDValue Sum;
@@ -7229,17 +7243,35 @@
                      DAG.getIntPtrConstant(1, dl));
   }
 
+  SDValue ShiftedLL = LL;
+  SDValue ShiftedLH = LH;
+
+  // Shift the input by the number of TrailingZeros in the divisor. The
+  // shifted out bits will be added to the remainder later.
+  if (TrailingZeros) {
+    ShiftedLL = DAG.getNode(
+        ISD::OR, dl, HiLoVT,
+        DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLL,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+        DAG.getNode(ISD::SHL, dl, HiLoVT, ShiftedLH,
+                    DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+                                               HiLoVT, dl)));
+    ShiftedLH =
+        DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLH,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+  }
+
   // Use addcarry if we can, otherwise use a compare to detect overflow.
   EVT SetCCType =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
   if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
     SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
-    Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
+    Sum = DAG.getNode(ISD::UADDO, dl, VTList, ShiftedLL, ShiftedLH);
     Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
                       DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
   } else {
-    Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
-    SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
+    Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, ShiftedLL, ShiftedLH);
+    SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, ShiftedLL, ISD::SETULT);
     // If the boolean for the target is 0 or 1, we can add the setcc result
     // directly.
     if (getBooleanContents(HiLoVT) ==
@@ -7263,6 +7295,17 @@
   // High half of the remainder is 0.
   SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
 
+  // If we shifted the input, shift the remainder left and add the bits we
+  // shifted off the input.
+  if (TrailingZeros) {
+    APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+    RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+    RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
+                       DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+                                   DAG.getConstant(Mask, dl, HiLoVT)));
+  }
+
   // If we only want remainder, we're done.
   if (Opcode == ISD::UREM) {
     Result.push_back(RemL);
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,65 @@
 define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a3, a0, 2
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    srli a3, a1, 2
+; RV32-NEXT:    add a3, a2, a3
+; RV32-NEXT:    sltu a2, a3, a2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    lui a3, 699051
+; RV32-NEXT:    addi a4, a3, -1365
+; RV32-NEXT:    mulhu a5, a2, a4
+; RV32-NEXT:    srli a6, a5, 1
+; RV32-NEXT:    andi a5, a5, -2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    andi a5, a0, 3
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    mul a3, a5, a3
+; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    sub a0, a1, a0
+; RV32-NEXT:    mul a0, a0, a4
+; RV32-NEXT:    add a1, a3, a0
+; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __udivti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a3, a0, 2
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a3, a1, 2
+; RV64-NEXT:    lui a4, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a4, %lo(.LCPI10_0)(a4)
+; RV64-NEXT:    add a3, a2, a3
+; RV64-NEXT:    sltu a2, a3, a2
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    mulhu a3, a2, a4
+; RV64-NEXT:    srli a5, a3, 1
+; RV64-NEXT:    andi a3, a3, -2
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    sub a2, a2, a3
+; RV64-NEXT:    slli a2, a2, 2
+; RV64-NEXT:    lui a3, %hi(.LCPI10_1)
+; RV64-NEXT:    ld a3, %lo(.LCPI10_1)(a3)
+; RV64-NEXT:    andi a5, a0, 3
+; RV64-NEXT:    or a2, a2, a5
+; RV64-NEXT:    sub a5, a0, a2
+; RV64-NEXT:    mul a3, a5, a3
+; RV64-NEXT:    mulhu a6, a5, a4
+; RV64-NEXT:    add a3, a6, a3
+; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    sub a0, a1, a0
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a1, a3, a0
+; RV64-NEXT:    mul a0, a5, a4
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@
 define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __umoddi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a3, a0, 2
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    lui a2, 699051
+; RV32-NEXT:    addi a2, a2, -1365
+; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    srli a3, a2, 1
+; RV32-NEXT:    andi a2, a2, -2
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    li a1, 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __umodti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a3, a0, 2
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    lui a3, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a3, %lo(.LCPI10_0)(a3)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    mulhu a2, a1, a3
+; RV64-NEXT:    srli a3, a2, 1
+; RV64-NEXT:    andi a2, a2, -2
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    li a1, 0
 ; RV64-NEXT:    ret
   %a = urem iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,24 @@
 define i64 @urem_i64_12(i64 %x) nounwind {
 ; X32-LABEL: urem_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __umoddi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $2, %eax
+; X32-NEXT:    shldl $30, %esi, %ecx
+; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    leal (%esi,%ecx,4), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: urem_i64_12:
@@ -1116,13 +1127,37 @@
 define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-LABEL: udiv_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __udivdi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    shrl $2, %eax
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    shldl $30, %ecx, %esi
+; X32-NEXT:    addl %eax, %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andl $3, %eax
+; X32-NEXT:    leal (%eax,%esi,4), %eax
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    sbbl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: udiv_i64_12:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,27 +425,39 @@
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __umodti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    shldq $62, %rdi, %rax
+; X86-64-NEXT:    shrq $2, %rsi
+; X86-64-NEXT:    addq %rax, %rsi
+; X86-64-NEXT:    adcq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    mulq %rcx
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rsi
+; X86-64-NEXT:    andl $3, %edi
+; X86-64-NEXT:    leaq (%rdi,%rsi,4), %rax
+; X86-64-NEXT:    xorl %edx, %edx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __umodti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    shldq $62, %rcx, %rax
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    addq %rax, %r8
+; WIN64-NEXT:    adcq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r8, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r8
+; WIN64-NEXT:    andl $3, %ecx
+; WIN64-NEXT:    leaq (%rcx,%r8,4), %rax
+; WIN64-NEXT:    xorl %edx, %edx
 ; WIN64-NEXT:    retq
 entry:
   %rem = urem i128 %x, 12
@@ -887,27 +899,59 @@
 define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: udiv_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __udivti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    shldq $62, %rdi, %rax
+; X86-64-NEXT:    movq %rsi, %rcx
+; X86-64-NEXT:    shrq $2, %rcx
+; X86-64-NEXT:    addq %rax, %rcx
+; X86-64-NEXT:    adcq $0, %rcx
+; X86-64-NEXT:    movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rcx, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rcx
+; X86-64-NEXT:    movl %edi, %eax
+; X86-64-NEXT:    andl $3, %eax
+; X86-64-NEXT:    leaq (%rax,%rcx,4), %rax
+; X86-64-NEXT:    subq %rax, %rdi
+; X86-64-NEXT:    sbbq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X86-64-NEXT:    imulq %rdi, %rcx
+; X86-64-NEXT:    movq %rdi, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
+; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: udiv_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __udivti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    shldq $62, %rcx, %rax
+; WIN64-NEXT:    movq %rdx, %r9
+; WIN64-NEXT:    shrq $2, %r9
+; WIN64-NEXT:    addq %rax, %r9
+; WIN64-NEXT:    adcq $0, %r9
+; WIN64-NEXT:    movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r9, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r9
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    andl $3, %eax
+; WIN64-NEXT:    leaq (%rax,%r9,4), %rax
+; WIN64-NEXT:    subq %rax, %rcx
+; WIN64-NEXT:    sbbq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
+; WIN64-NEXT:    imulq %rcx, %r9
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
+; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
   %rem = udiv i128 %x, 12
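
As a reading aid for the comment added in TargetLowering.cpp, the following standalone C++ sketch works through the same arithmetic for a 64-bit dividend and the divisor 12 used in the tests. It only illustrates the math, not the DAG nodes the patch builds: the helper names are invented for this example, the half-width remainder is written with a plain % (the real expansion lowers it to a multiply-high sequence), and the quotient step here divides the exact multiple by the two factors of 12 separately, a shift for the power of two and a multiply by 0xAAAAAAAAAAAAAAAB, the multiplicative inverse of 3 modulo 2^64.

#include <cassert>
#include <cstdint>
#include <cstdio>

// "Remainder by Summing Digits": 2^32 % 3 == 1, so a 64-bit value is congruent
// modulo 3 to the sum of its two 32-bit halves, and a carry out of that sum
// also counts as 1.
static uint32_t rem3(uint64_t X) {
  uint64_t Sum = (X & 0xffffffff) + (X >> 32); // at most 33 bits
  Sum = (Sum & 0xffffffff) + (Sum >> 32);      // fold the carry back in
  return static_cast<uint32_t>(Sum % 3);       // half-width urem by the odd part
}

// X % 12: shift out the two trailing zero bits of 12, take the remainder
// modulo the odd part (3), then shift the remainder back and add the bits
// that were shifted off the dividend.
static uint64_t urem12(uint64_t X) {
  return (static_cast<uint64_t>(rem3(X >> 2)) << 2) | (X & 3);
}

// X / 12: subtract the remainder to get an exact multiple of 12, divide by 4
// with a shift, then divide the remaining exact multiple of 3 by multiplying
// with the multiplicative inverse of 3 modulo 2^64, since exact division by an
// odd constant is just a multiply.
static uint64_t udiv12(uint64_t X) {
  uint64_t Multiple = X - urem12(X); // divisible by 12
  return (Multiple >> 2) * 0xAAAAAAAAAAAAAAABULL;
}

int main() {
  const uint64_t Tests[] = {0, 1, 11, 12, 13, 0x123456789abcdefULL, ~0ULL};
  for (uint64_t X : Tests) {
    assert(urem12(X) == X % 12);
    assert(udiv12(X) == X / 12);
  }
  std::printf("ok\n");
  return 0;
}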