diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,17 @@
 // Remainder = Sum % Constant
 // This is based on "Remainder by Summing Digits" from Hacker's Delight.
 //
-// For division, we can compute the remainder, subtract it from the dividend,
-// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
+// For division, we can compute the remainder using the algorithm described
+// above, subtract it from the dividend to get an exact multiple of Constant.
+// Then multiply that exact multiple by the multiplicative inverse modulo
+// (1 << (BitWidth / 2)) to get the quotient.
+
+// If Constant is even, we can shift right the dividend and the divisor by the
+// number of trailing zeros in Constant before applying the remainder algorithm.
+// If we're after the quotient, we can subtract the remainder from the shifted
+// dividend and multiply by the multiplicative inverse of the shifted divisor.
+// If we want the remainder, we shift the value left by the number of trailing
+// zeros and add the bits that were shifted out of the dividend.
 bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
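The expansion described by the comment above can be written out in scalar C++ for one concrete case. The sketch below is not part of the patch; the function names are made up for illustration. It divides a 64-bit value by 12 using only 32-bit-sized pieces, mirroring the steps of the expansion: pre-shift by the two trailing zeros, sum the halves (plus carry) to get the remainder modulo the odd divisor 3, subtract it, and multiply by the inverse of 3 modulo 2^64.

```cpp
#include <cassert>
#include <cstdint>

// Illustration only: divide a 64-bit value by 12 the way the expanded node
// sequence does. udiv64_by_12/urem64_by_12 are hypothetical names.
uint64_t udiv64_by_12(uint64_t X) {
  // 12 = 3 << 2, so TrailingZeros = 2 and the odd divisor is 3.
  uint64_t Shifted = X >> 2;
  uint32_t LL = (uint32_t)Shifted;         // low half
  uint32_t LH = (uint32_t)(Shifted >> 32); // high half

  // Remainder by summing digits: (1 << 32) % 3 == 1, so
  // Shifted % 3 == (LL + LH + carry) % 3.
  uint32_t Sum = LL + LH;
  Sum += Sum < LL; // add the carry back in
  uint32_t Rem = Sum % 3;

  // Subtract the remainder to get an exact multiple of 3, then multiply by
  // the multiplicative inverse of 3 modulo 2^64 (0xAAAAAAAAAAAAAAAB).
  return (Shifted - Rem) * 0xAAAAAAAAAAAAAAABULL;
}

uint64_t urem64_by_12(uint64_t X) {
  // Scale the remainder of the shifted dividend back up and add the two
  // bits that were shifted out.
  uint64_t Rem3 = (X >> 2) - udiv64_by_12(X) * 3;
  return (Rem3 << 2) | (X & 3);
}

int main() {
  for (uint64_t X : {0ULL, 5ULL, 12ULL, 0x123456789ABCDEFULL, ~0ULL}) {
    assert(udiv64_by_12(X) == X / 12);
    assert(urem64_by_12(X) == X % 12);
  }
}
```

The hunks that follow build this same computation out of HiLoVT-sized nodes instead of scalar C++.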
@@ -7188,7 +7197,7 @@
   if (!CN)
     return false;
 
-  const APInt &Divisor = CN->getAPIntValue();
+  APInt Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,12 +7218,20 @@
   if (DAG.shouldOptForSize())
     return false;
 
-  // Early out for 0, 1 or even divisors.
-  if (Divisor.ule(1) || Divisor[0] == 0)
+  // Early out for 0 or 1 divisors.
+  if (Divisor.ule(1))
     return false;
 
+  // If the divisor is even, shift it until it becomes odd.
+  unsigned TrailingZeros = 0;
+  if (!Divisor[0]) {
+    TrailingZeros = Divisor.countTrailingZeros();
+    Divisor.lshrInPlace(TrailingZeros);
+  }
+
   SDLoc dl(N);
   SDValue Sum;
+  SDValue PartialRem;
 
   // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
   // then add in the carry.
@@ -7229,6 +7246,27 @@
                      DAG.getIntPtrConstant(1, dl));
   }
 
+  // Shift the input by the number of TrailingZeros in the divisor. The
+  // shifted out bits will be added to the remainder later.
+  if (TrailingZeros) {
+    LL = DAG.getNode(
+        ISD::OR, dl, HiLoVT,
+        DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+        DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
+                    DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+                                               HiLoVT, dl)));
+    LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
+                     DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+
+    // Save the shifted off bits if we need the remainder.
+    if (Opcode != ISD::UDIV) {
+      APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+      PartialRem = DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+                               DAG.getConstant(Mask, dl, HiLoVT));
+    }
+  }
+
   // Use addcarry if we can, otherwise use a compare to detect overflow.
   EVT SetCCType =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
@@ -7260,45 +7298,45 @@
   SDValue RemL =
       DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
                   DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
-  // High half of the remainder is 0.
   SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
 
-  // If we only want remainder, we're done.
-  if (Opcode == ISD::UREM) {
-    Result.push_back(RemL);
-    Result.push_back(RemH);
-    return true;
-  }
-
-  // Otherwise, we need to compute the quotient.
-
-  // Join the remainder halves.
-  SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
-
-  // Subtract the remainder from the input.
-  SDValue In = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Rem);
-
-  // Multiply by the multiplicative inverse of the divisor modulo
-  // (1 << BitWidth).
-  APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
-  APInt MulFactor = Divisor.zext(BitWidth + 1);
-  MulFactor = MulFactor.multiplicativeInverse(Mod);
-  MulFactor = MulFactor.trunc(BitWidth);
-
-  SDValue Quotient =
-      DAG.getNode(ISD::MUL, dl, VT, In, DAG.getConstant(MulFactor, dl, VT));
-
-  // Split the quotient into low and high parts.
-  SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
-                              DAG.getIntPtrConstant(0, dl));
-  SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
-                              DAG.getIntPtrConstant(1, dl));
-  Result.push_back(QuotL);
-  Result.push_back(QuotH);
-  // For DIVREM, also return the remainder parts.
-  if (Opcode == ISD::UDIVREM) {
+  if (Opcode != ISD::UREM) {
+    // Subtract the remainder from the shifted dividend.
+    SDValue Dividend = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+    SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
+
+    Dividend = DAG.getNode(ISD::SUB, dl, VT, Dividend, Rem);
+
+    // Multiply by the multiplicative inverse of the divisor modulo
+    // (1 << BitWidth).
+    APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
+    APInt MulFactor = Divisor.zext(BitWidth + 1);
+    MulFactor = MulFactor.multiplicativeInverse(Mod);
+    MulFactor = MulFactor.trunc(BitWidth);
+
+    SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
+                                   DAG.getConstant(MulFactor, dl, VT));
+
+    // Split the quotient into low and high parts.
+    SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                                DAG.getIntPtrConstant(0, dl));
+    SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                                DAG.getIntPtrConstant(1, dl));
+    Result.push_back(QuotL);
+    Result.push_back(QuotH);
+  }
+
+  if (Opcode != ISD::UDIV) {
+    // If we shifted the input, shift the remainder left and add the bits we
+    // shifted off the input.
+    if (TrailingZeros) {
+      APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+      RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+                         DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+      RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
+    }
     Result.push_back(RemL);
-    Result.push_back(RemH);
+    Result.push_back(DAG.getConstant(0, dl, HiLoVT));
   }
 
   return true;
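The test updates that follow show what this expansion emits for a divisor of 12. The magic multipliers they materialize, 0xAAAAAAAB (`lui 699051` + `addi -1365` on RV32, `movl $-1431655765` on X86) and 0xAAAAAAAAAAAAAAAB (`movabsq $-6148914691236517205`), are the multiplicative inverses of the odd divisor 3 modulo 2^32 and 2^64, which the code above obtains from APInt::multiplicativeInverse. A minimal stand-alone sketch of how such an inverse can be computed, using Newton's iteration instead of APInt, purely for illustration:

```cpp
#include <cassert>
#include <cstdint>

// Inverse of an odd N modulo 2^64; each iteration doubles the number of
// correct low bits. (APInt::multiplicativeInverse is what the expansion
// actually uses; this is only an illustration.)
uint64_t inverseMod2_64(uint64_t N) {
  assert((N & 1) && "only odd numbers are invertible mod a power of two");
  uint64_t X = N; // correct to 3 bits, since N*N == 1 (mod 8) for odd N
  for (int I = 0; I < 6; ++I)
    X *= 2 - N * X;
  return X;
}

int main() {
  // The odd part of 12 is 3; its inverses are the magic multipliers below.
  assert(inverseMod2_64(3) == 0xAAAAAAAAAAAAAAABULL);
  assert((uint32_t)inverseMod2_64(3) == 0xAAAAAAABu);
  // The sum-of-halves remainder trick relies on (1 << HBitWidth) % 3 == 1.
  assert((1ULL << 32) % 3 == 1);
}
```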
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,59 @@
 define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a2, a0, a1
+; RV32-NEXT:    sltu a3, a2, a0
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    lui a3, 699051
+; RV32-NEXT:    addi a4, a3, -1365
+; RV32-NEXT:    mulhu a5, a2, a4
+; RV32-NEXT:    srli a6, a5, 1
+; RV32-NEXT:    andi a5, a5, -2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    mul a3, a5, a3
+; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    sub a0, a1, a0
+; RV32-NEXT:    mul a0, a0, a4
+; RV32-NEXT:    add a1, a3, a0
+; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __udivti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a0, a0, 2
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    lui a2, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a2, %lo(.LCPI10_0)(a2)
+; RV64-NEXT:    add a3, a0, a1
+; RV64-NEXT:    sltu a4, a3, a0
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    mulhu a4, a3, a2
+; RV64-NEXT:    srli a5, a4, 1
+; RV64-NEXT:    andi a4, a4, -2
+; RV64-NEXT:    lui a6, %hi(.LCPI10_1)
+; RV64-NEXT:    ld a6, %lo(.LCPI10_1)(a6)
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    sub a3, a3, a4
+; RV64-NEXT:    sub a4, a0, a3
+; RV64-NEXT:    mul a5, a4, a6
+; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    sltu a0, a0, a3
+; RV64-NEXT:    sub a0, a1, a0
+; RV64-NEXT:    mul a0, a0, a2
+; RV64-NEXT:    add a1, a5, a0
+; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@
 define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __umoddi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a2, a1, a0
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    lui a2, 699051
+; RV32-NEXT:    addi a2, a2, -1365
+; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    srli a3, a2, 1
+; RV32-NEXT:    andi a2, a2, -2
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    li a1, 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __umodti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a0, a0, 2
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    lui a2, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a2, %lo(.LCPI10_0)(a2)
+; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    sltu a3, a1, a0
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    mulhu a2, a1, a2
+; RV64-NEXT:    srli a3, a2, 1
+; RV64-NEXT:    andi a2, a2, -2
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    li a1, 0
 ; RV64-NEXT:    ret
   %a = urem iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,23 @@
 define i64 @urem_i64_12(i64 %x) nounwind {
; X32-LABEL: urem_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __umoddi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shrdl $2, %ecx, %esi
+; X32-NEXT:    shrl $2, %ecx
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    leal (%esi,%ecx,4), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: urem_i64_12:
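The urem checks above encode a simple identity: after the two trailing zero bits are shifted off, the remainder modulo 3 is rescaled and the saved low bits are merged back in (the `leal (%esi,%ecx,4), %eax` and `or a0, a1, a0` patterns), while the high half of the result is zero because the remainder of a division by 12 always fits in the low half (hence `xorl %edx, %edx` / `li a1, 0`). A scalar restatement of that identity, not taken from the tests:

```cpp
#include <cassert>
#include <cstdint>

// What the urem-by-12 expansion computes, written out for a 64-bit input:
// pre-shift by the two trailing zeros, reduce modulo the odd part (3), then
// scale back up and merge in the shifted-out bits.
uint64_t rem12(uint64_t X) {
  return (((X >> 2) % 3) << 2) | (X & 3);
}

int main() {
  for (uint64_t X : {0ULL, 1ULL, 11ULL, 12ULL, 13ULL,
                     0xFEDCBA9876543210ULL, ~0ULL})
    assert(rem12(X) == X % 12);
}
```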
@@ -1116,13 +1126,33 @@
 define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-LABEL: udiv_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __udivdi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    shrdl $2, %edi, %ecx
+; X32-NEXT:    shrl $2, %edi
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %esi
+; X32-NEXT:    subl %esi, %ecx
+; X32-NEXT:    sbbl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: udiv_i64_12:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,27 +425,37 @@
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __umodti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    shrdq $2, %rsi, %rdi
+; X86-64-NEXT:    shrq $2, %rsi
+; X86-64-NEXT:    addq %rdi, %rsi
+; X86-64-NEXT:    adcq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    mulq %rcx
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rsi
+; X86-64-NEXT:    andl $3, %edi
+; X86-64-NEXT:    leaq (%rdi,%rsi,4), %rax
+; X86-64-NEXT:    xorl %edx, %edx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __umodti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    shrdq $2, %rdx, %rcx
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    addq %rcx, %r8
+; WIN64-NEXT:    adcq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r8, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r8
+; WIN64-NEXT:    andl $3, %ecx
+; WIN64-NEXT:    leaq (%rcx,%r8,4), %rax
+; WIN64-NEXT:    xorl %edx, %edx
 ; WIN64-NEXT:    retq
 entry:
   %rem = urem i128 %x, 12
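In the udiv_i128_12 hunk below, the quotient is produced by multiplying the exact multiple of 3 by the 128-bit inverse of 3. Only the low 128 bits of that product are needed, which is why the checks use one widening `mulq` plus two truncating `imulq`: the two `movabsq` constants are the low and high halves of the 128-bit inverse 0xAAAA...AAAB. A small sketch of that multiword multiply, using the `unsigned __int128` compiler extension, for illustration only:

```cpp
#include <cassert>
#include <cstdint>

using u128 = unsigned __int128;

// Low 128 bits of a 128x128 multiply, built from 64-bit pieces the way the
// expanded code is: one full 64x64->128 multiply of the low halves plus two
// truncated cross multiplies folded into the high half.
u128 mulLow128(u128 A, u128 B) {
  uint64_t ALo = (uint64_t)A, AHi = (uint64_t)(A >> 64);
  uint64_t BLo = (uint64_t)B, BHi = (uint64_t)(B >> 64);
  u128 Prod = (u128)ALo * BLo;                 // mulq
  Prod += (u128)(ALo * BHi + AHi * BLo) << 64; // imulq + imulq + addq
  return Prod;
}

int main() {
  // 128-bit inverse of 3: high half 0xAAAA...AAAA, low half 0xAAAA...AAAB.
  u128 Inv3 = ((u128)0xAAAAAAAAAAAAAAAAULL << 64) | 0xAAAAAAAAAAAAAAABULL;
  assert((u128)3 * Inv3 == 1); // 3 * Inv3 == 1 (mod 2^128)

  u128 X = ((u128)0x0123456789ABCDEFULL << 64) | 0xFEDCBA9876543210ULL;
  u128 Multiple = X - X % 3; // exact multiple of 3, as in the expansion
  assert(mulLow128(Multiple, Inv3) == Multiple / 3);
}
```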
@@ -887,27 +897,51 @@
 define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: udiv_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __udivti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    shrdq $2, %rsi, %rdi
+; X86-64-NEXT:    shrq $2, %rsi
+; X86-64-NEXT:    movq %rdi, %rcx
+; X86-64-NEXT:    addq %rsi, %rcx
+; X86-64-NEXT:    adcq $0, %rcx
+; X86-64-NEXT:    movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rcx, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rcx
+; X86-64-NEXT:    subq %rcx, %rdi
+; X86-64-NEXT:    sbbq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X86-64-NEXT:    imulq %rdi, %rcx
+; X86-64-NEXT:    movq %rdi, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
+; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: udiv_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __udivti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    shrdq $2, %rdx, %rcx
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    movq %rcx, %r9
+; WIN64-NEXT:    addq %r8, %r9
+; WIN64-NEXT:    adcq $0, %r9
+; WIN64-NEXT:    movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r9, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r9
+; WIN64-NEXT:    subq %r9, %rcx
+; WIN64-NEXT:    sbbq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
+; WIN64-NEXT:    imulq %rcx, %r9
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
+; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
   %rem = udiv i128 %x, 12