diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7168,8 +7168,15 @@
 // Remainder = Sum % Constant
 // This is based on "Remainder by Summing Digits" from Hacker's Delight.
 //
-// For division, we can compute the remainder, subtract it from the dividend,
-// and then multiply by the multiplicative inverse modulo (1 << (BitWidth / 2)).
+// If Constant is even, we can shift right the dividend and the divisor by the
+// number of trailing zeros in Constant before computing the remainder. Then
+// fixup the remainder by shifting it left by the number of trailing zeros and
+// adding the bits that were shifted out of the dividend.
+//
+// For division, we can compute the remainder using the algorithm described
+// above, subtract it from the dividend to get an exact multiple of Constant.
+// Then multiply that exact multiple by the multiplicative inverse modulo
+// (1 << (BitWidth / 2)).
 bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                                             SmallVectorImpl<SDValue> &Result,
                                             EVT HiLoVT, SelectionDAG &DAG,
@@ -7188,7 +7195,7 @@
   if (!CN)
     return false;
 
-  const APInt &Divisor = CN->getAPIntValue();
+  APInt Divisor = CN->getAPIntValue();
   unsigned BitWidth = Divisor.getBitWidth();
   unsigned HBitWidth = BitWidth / 2;
   assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -7209,10 +7216,17 @@
   if (DAG.shouldOptForSize())
     return false;
 
-  // Early out for 0, 1 or even divisors.
-  if (Divisor.ule(1) || Divisor[0] == 0)
+  // Early out for 0 or 1 divisors.
+  if (Divisor.ule(1))
     return false;
 
+  // If the divisor is even, shift it until it becomes odd.
+  unsigned TrailingZeros = 0;
+  if (!Divisor[0]) {
+    TrailingZeros = Divisor.countTrailingZeros();
+    Divisor.lshrInPlace(TrailingZeros);
+  }
+
   SDLoc dl(N);
 
   SDValue Sum;
@@ -7229,17 +7243,35 @@
                      DAG.getIntPtrConstant(1, dl));
   }
 
+  SDValue ShiftedLL = LL;
+  SDValue ShiftedLH = LH;
+
+  // Shift the input by the number of TrailingZeros in the divisor. The
+  // shifted out bits will be added to the remainder later.
+  if (TrailingZeros) {
+    ShiftedLL = DAG.getNode(
+        ISD::OR, dl, HiLoVT,
+        DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLL,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+        DAG.getNode(ISD::SHL, dl, HiLoVT, ShiftedLH,
+                    DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+                                               HiLoVT, dl)));
+    ShiftedLH =
+        DAG.getNode(ISD::SRL, dl, HiLoVT, ShiftedLH,
+                    DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+  }
+
   // Use addcarry if we can, otherwise use a compare to detect overflow.
   EVT SetCCType =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
   if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
     SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
-    Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
+    Sum = DAG.getNode(ISD::UADDO, dl, VTList, ShiftedLL, ShiftedLH);
     Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
                       DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
   } else {
-    Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
-    SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
+    Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, ShiftedLL, ShiftedLH);
+    SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, ShiftedLL, ISD::SETULT);
     // If the boolean for the target is 0 or 1, we can add the setcc result
     // directly.
     if (getBooleanContents(HiLoVT) ==
@@ -7263,6 +7295,17 @@
   // High half of the remainder is 0.
   SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
 
+  // If we shifted the input, shift the remainder left and add the bits we
+  // shifted off the input.
+  if (TrailingZeros) {
+    APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+    RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+    RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
+                       DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+                                   DAG.getConstant(Mask, dl, HiLoVT)));
+  }
+
   // If we only want remainder, we're done.
   if (Opcode == ISD::UREM) {
     Result.push_back(RemL);
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -502,24 +502,65 @@
 define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __udivdi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a3, a0, 2
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    srli a3, a1, 2
+; RV32-NEXT:    add a3, a2, a3
+; RV32-NEXT:    sltu a2, a3, a2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    lui a3, 699051
+; RV32-NEXT:    addi a4, a3, -1365
+; RV32-NEXT:    mulhu a5, a2, a4
+; RV32-NEXT:    srli a6, a5, 1
+; RV32-NEXT:    andi a5, a5, -2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    andi a5, a0, 3
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    mul a3, a5, a3
+; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    sub a0, a1, a0
+; RV32-NEXT:    mul a0, a0, a4
+; RV32-NEXT:    add a1, a3, a0
+; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __udivti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a3, a0, 2
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a3, a1, 2
+; RV64-NEXT:    lui a4, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a4, %lo(.LCPI10_0)(a4)
+; RV64-NEXT:    add a3, a2, a3
+; RV64-NEXT:    sltu a2, a3, a2
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    mulhu a3, a2, a4
+; RV64-NEXT:    srli a5, a3, 1
+; RV64-NEXT:    andi a3, a3, -2
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    sub a2, a2, a3
+; RV64-NEXT:    slli a2, a2, 2
+; RV64-NEXT:    lui a3, %hi(.LCPI10_1)
+; RV64-NEXT:    ld a3, %lo(.LCPI10_1)(a3)
+; RV64-NEXT:    andi a5, a0, 3
+; RV64-NEXT:    or a2, a2, a5
+; RV64-NEXT:    sub a5, a0, a2
+; RV64-NEXT:    mul a3, a5, a3
+; RV64-NEXT:    mulhu a6, a5, a4
+; RV64-NEXT:    add a3, a6, a3
+; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    sub a0, a1, a0
+; RV64-NEXT:    mul a0, a0, a4
+; RV64-NEXT:    add a1, a3, a0
+; RV64-NEXT:    mul a0, a5, a4
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -335,24 +335,46 @@
 define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_12:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li a2, 12
-; RV32-NEXT:    li a3, 0
-; RV32-NEXT:    call __umoddi3@plt
-; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    slli a2, a1, 30
+; RV32-NEXT:    srli a3, a0, 2
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    lui a2, 699051
+; RV32-NEXT:    addi a2, a2, -1365
+; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    srli a3, a2, 1
+; RV32-NEXT:    andi a2, a2, -2
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    li a1, 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_urem_12:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    li a2, 12
-; RV64-NEXT:    li a3, 0
-; RV64-NEXT:    call __umodti3@plt
-; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    slli a2, a1, 62
+; RV64-NEXT:    srli a3, a0, 2
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    lui a3, %hi(.LCPI10_0)
+; RV64-NEXT:    ld a3, %lo(.LCPI10_0)(a3)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    sltu a2, a1, a2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    mulhu a2, a1, a3
+; RV64-NEXT:    srli a3, a2, 1
+; RV64-NEXT:    andi a2, a2, -2
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub a1, a1, a2
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    li a1, 0
 ; RV64-NEXT:    ret
   %a = urem iXLen2 %x, 12
   ret iXLen2 %a
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -735,13 +735,24 @@
 define i64 @urem_i64_12(i64 %x) nounwind {
 ; X32-LABEL: urem_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __umoddi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    shrl $2, %eax
+; X32-NEXT:    shldl $30, %esi, %ecx
+; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    andl $3, %esi
+; X32-NEXT:    leal (%esi,%ecx,4), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: urem_i64_12:
@@ -1116,13 +1127,37 @@
 define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-LABEL: udiv_i64_12:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $12
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __udivdi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    shrl $2, %eax
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    shldl $30, %ecx, %esi
+; X32-NEXT:    addl %eax, %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    shrl %edx
+; X32-NEXT:    leal (%edx,%edx,2), %eax
+; X32-NEXT:    subl %eax, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    andl $3, %eax
+; X32-NEXT:    leal (%eax,%esi,4), %eax
+; X32-NEXT:    subl %eax, %ecx
+; X32-NEXT:    sbbl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: udiv_i64_12:
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,27 +425,39 @@
 define i128 @urem_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: urem_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __umodti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    shldq $62, %rdi, %rax
+; X86-64-NEXT:    shrq $2, %rsi
+; X86-64-NEXT:    addq %rax, %rsi
+; X86-64-NEXT:    adcq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    mulq %rcx
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rsi
+; X86-64-NEXT:    andl $3, %edi
+; X86-64-NEXT:    leaq (%rdi,%rsi,4), %rax
+; X86-64-NEXT:    xorl %edx, %edx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: urem_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __umodti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    shldq $62, %rcx, %rax
+; WIN64-NEXT:    shrq $2, %r8
+; WIN64-NEXT:    addq %rax, %r8
+; WIN64-NEXT:    adcq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r8, %rax
+; WIN64-NEXT:    mulq %rdx
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r8
+; WIN64-NEXT:    andl $3, %ecx
+; WIN64-NEXT:    leaq (%rcx,%r8,4), %rax
+; WIN64-NEXT:    xorl %edx, %edx
 ; WIN64-NEXT:    retq
 entry:
   %rem = urem i128 %x, 12
@@ -887,27 +899,59 @@
 define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-LABEL: udiv_i128_12:
 ; X86-64:       # %bb.0: # %entry
-; X86-64-NEXT:    pushq %rax
-; X86-64-NEXT:    movl $12, %edx
-; X86-64-NEXT:    xorl %ecx, %ecx
-; X86-64-NEXT:    callq __udivti3@PLT
-; X86-64-NEXT:    popq %rcx
+; X86-64-NEXT:    movq %rsi, %rax
+; X86-64-NEXT:    shldq $62, %rdi, %rax
+; X86-64-NEXT:    movq %rsi, %rcx
+; X86-64-NEXT:    shrq $2, %rcx
+; X86-64-NEXT:    addq %rax, %rcx
+; X86-64-NEXT:    adcq $0, %rcx
+; X86-64-NEXT:    movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT:    movq %rcx, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    shrq %rdx
+; X86-64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; X86-64-NEXT:    subq %rax, %rcx
+; X86-64-NEXT:    movl %edi, %eax
+; X86-64-NEXT:    andl $3, %eax
+; X86-64-NEXT:    leaq (%rax,%rcx,4), %rax
+; X86-64-NEXT:    subq %rax, %rdi
+; X86-64-NEXT:    sbbq $0, %rsi
+; X86-64-NEXT:    movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
+; X86-64-NEXT:    imulq %rdi, %rcx
+; X86-64-NEXT:    movq %rdi, %rax
+; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
+; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
 ; WIN64-LABEL: udiv_i128_12:
 ; WIN64:       # %bb.0: # %entry
-; WIN64-NEXT:    subq $72, %rsp
-; WIN64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $12, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    movq $0, {{[0-9]+}}(%rsp)
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
-; WIN64-NEXT:    callq __udivti3
-; WIN64-NEXT:    movq %xmm0, %rax
-; WIN64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; WIN64-NEXT:    movq %xmm0, %rdx
-; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    movq %rdx, %r8
+; WIN64-NEXT:    movq %rdx, %rax
+; WIN64-NEXT:    shldq $62, %rcx, %rax
+; WIN64-NEXT:    movq %rdx, %r9
+; WIN64-NEXT:    shrq $2, %r9
+; WIN64-NEXT:    addq %rax, %r9
+; WIN64-NEXT:    adcq $0, %r9
+; WIN64-NEXT:    movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT:    movq %r9, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    shrq %rdx
+; WIN64-NEXT:    leaq (%rdx,%rdx,2), %rax
+; WIN64-NEXT:    subq %rax, %r9
+; WIN64-NEXT:    movl %ecx, %eax
+; WIN64-NEXT:    andl $3, %eax
+; WIN64-NEXT:    leaq (%rax,%r9,4), %rax
+; WIN64-NEXT:    subq %rax, %rcx
+; WIN64-NEXT:    sbbq $0, %r8
+; WIN64-NEXT:    movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
+; WIN64-NEXT:    imulq %rcx, %r9
+; WIN64-NEXT:    movq %rcx, %rax
+; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
+; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
   %rem = udiv i128 %x, 12
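
As a reading aid for the comment added in TargetLowering.cpp, the following standalone C++ sketch works through the same arithmetic for a 64-bit dividend and the divisor 12 used in the tests. It only illustrates the math, not the DAG nodes the patch builds: the helper names are invented for this example, the half-width remainder is written with a plain % (the real expansion lowers it to a multiply-high sequence), and the quotient step here divides the exact multiple by the two factors of 12 separately, a shift for the power of two and a multiply by 0xAAAAAAAAAAAAAAAB, the multiplicative inverse of 3 modulo 2^64.

#include <cassert>
#include <cstdint>
#include <cstdio>

// "Remainder by Summing Digits": 2^32 % 3 == 1, so a 64-bit value is congruent
// modulo 3 to the sum of its two 32-bit halves, and a carry out of that sum
// also counts as 1.
static uint32_t rem3(uint64_t X) {
  uint64_t Sum = (X & 0xffffffff) + (X >> 32); // at most 33 bits
  Sum = (Sum & 0xffffffff) + (Sum >> 32);      // fold the carry back in
  return static_cast<uint32_t>(Sum % 3);       // half-width urem by the odd part
}

// X % 12: shift out the two trailing zero bits of 12, take the remainder
// modulo the odd part (3), then shift the remainder back and add the bits
// that were shifted off the dividend.
static uint64_t urem12(uint64_t X) {
  return (static_cast<uint64_t>(rem3(X >> 2)) << 2) | (X & 3);
}

// X / 12: subtract the remainder to get an exact multiple of 12, divide by 4
// with a shift, then divide the remaining exact multiple of 3 by multiplying
// with the multiplicative inverse of 3 modulo 2^64, since exact division by an
// odd constant is just a multiply.
static uint64_t udiv12(uint64_t X) {
  uint64_t Multiple = X - urem12(X); // divisible by 12
  return (Multiple >> 2) * 0xAAAAAAAAAAAAAAABULL;
}

int main() {
  const uint64_t Tests[] = {0, 1, 11, 12, 13, 0x123456789abcdefULL, ~0ULL};
  for (uint64_t X : Tests) {
    assert(urem12(X) == X % 12);
    assert(udiv12(X) == X / 12);
  }
  std::printf("ok\n");
  return 0;
}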