diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4686,6 +4686,24 @@
                         SDValue LL = SDValue(), SDValue LH = SDValue(),
                         SDValue RL = SDValue(), SDValue RH = SDValue()) const;
 
+  /// Attempt to expand an n-bit urem by constant using adds, compares, and
+  /// a n/2-bit urem by constant which will be expanded by DAGCombiner. This is
+  /// not possible for all constant divisors.
+  /// \param N Node to expand.
+  /// \param Divisor The constant divisor.
+  /// \param Lo Low half of the result.
+  /// \param Hi High half of the result.
+  /// \param HiLoVT The value type to use for the Lo and Hi nodes. Should be
+  /// half of VT.
+  /// \param InL Low bits of the LHS of the UREM. You can use this parameter
+  /// if you want to control how low bits are extracted from the LHS.
+  /// \param InH High bits of the LHS of the UREM. See InL for meaning.
+  /// \returns true if the node has been expanded, false if it has not.
+  bool expandUREMByConstant(SDNode *N, const APInt &Divisor, SDValue &Lo,
+                            SDValue &Hi, EVT HiLoVT, SelectionDAG &DAG,
+                            SDValue InL = SDValue(),
+                            SDValue InH = SDValue()) const;
+
   /// Expand funnel shift.
   /// \param N Node to expand
   /// \returns The expansion if successful, SDValue() otherwise
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4539,6 +4539,19 @@
     return;
   }
 
+  // Try to expand UREM by constant.
+  if (auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+    // Only if the new type is legal.
+ if (isTypeLegal(NVT)) { + SDValue InL, InH; + GetExpandedInteger(N->getOperand(0), InL, InH); + if (TLI.expandUREMByConstant(N, CN->getAPIntValue(), Lo, Hi, NVT, DAG, + InL, InH)) + return; + } + } + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UREM_I16; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7136,6 +7136,88 @@ return Ok; } +// Optimize unsigned Op0 % Constant for types twice as large as a legal VT. If +// (1 << (BitWidth / 2)) % Constant == 1, then the remainder can be computed +// as: +// Sum += __builtin_uadd_overflow(Lo, High, &Sum); +// Remainder = Sum % Constant +// This is based on "Remainder by Summing Digits" from Hacker's Delight. +bool TargetLowering::expandUREMByConstant(SDNode *N, const APInt &Divisor, + SDValue &Lo, SDValue &Hi, EVT HiLoVT, + SelectionDAG &DAG, SDValue InL, + SDValue InH) const { + assert(N->getOpcode() == ISD::UREM && "Unexpected opcode!"); + EVT VT = N->getValueType(0); + + // We depend on the UREM by constant optimization in DAGCombiner that requires + // high multiply. + if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) && + !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT)) + return false; + + // Don't expand if the target doesn't want us to. + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(VT, Attr)) + return false; + + // Early out for 0, 1 or even divisors. + if (Divisor.ule(1) || Divisor[0] == 0) + return false; + + unsigned BitWidth = Divisor.getBitWidth(); + unsigned HBitWidth = BitWidth / 2; + assert(VT.getScalarSizeInBits() == BitWidth && + HiLoVT.getScalarSizeInBits() == HBitWidth && "Unexpected VTs"); + + SDLoc dl(N); + SDValue Sum; + + // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and + // then add in the carry. 
+ if (APInt::getOneBitSet(BitWidth, HBitWidth).urem(Divisor).isOneValue()) { + assert(!InL == !InH && "Expected both input halves or no input halves!"); + if (!InL) { + InL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, N->getOperand(0), + DAG.getIntPtrConstant(0, dl)); + InH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, N->getOperand(0), + DAG.getIntPtrConstant(1, dl)); + } + + // Use addcarry if we can, otherwise fall back to add followed by setcc to + // detect carry. + // TODO: Support other ways? + EVT SetCCType = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT); + if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) { + SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType); + Sum = DAG.getNode(ISD::UADDO, dl, VTList, InL, InH); + Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum, + DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1)); + } else { + Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, InL, InH); + SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, InL, ISD::SETULT); + if (getBooleanContents(HiLoVT) == + TargetLoweringBase::ZeroOrOneBooleanContent) + Carry = DAG.getZExtOrTrunc(Carry, dl, HiLoVT); + else + Carry = DAG.getSelect(dl, HiLoVT, Carry, DAG.getConstant(1, dl, HiLoVT), + DAG.getConstant(0, dl, HiLoVT)); + Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry); + } + } + + // If we didn't find a sum, fall back to the non-constant expansion. + if (!Sum) + return false; + + // Perform an HiLoVT urem on the Sum using truncated divisor. + Lo = DAG.getNode(ISD::UREM, dl, HiLoVT, Sum, + DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT)); + // High half of the remainder is 0. + Hi = DAG.getConstant(0, dl, HiLoVT); + return true; +} + // Check that (every element of) Z is undef or not an exact multiple of BW. 
 static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
   return ISD::matchUnaryPredicate(
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29438,6 +29438,15 @@
   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
          "Unexpected return type for lowering");
 
+  if (Op.getOpcode() == ISD::UREM) {
+    if (auto *CN = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+      SDValue Lo, Hi;
+      if (expandUREMByConstant(Op.getNode(), CN->getAPIntValue(), Lo, Hi,
+                               MVT::i64, DAG))
+        return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Lo, Hi);
+    }
+  }
+
   RTLIB::Libcall LC;
   bool isSigned;
   switch (Op->getOpcode()) {
diff --git a/llvm/test/CodeGen/VE/Scalar/rem.ll b/llvm/test/CodeGen/VE/Scalar/rem.ll
--- a/llvm/test/CodeGen/VE/Scalar/rem.ll
+++ b/llvm/test/CodeGen/VE/Scalar/rem.ll
@@ -181,11 +181,11 @@
 ; CHECK-NEXT:    lea %s2, __umodti3@lo
 ; CHECK-NEXT:    and %s2, %s2, (32)0
 ; CHECK-NEXT:    lea.sl %s12, __umodti3@hi(, %s2)
-; CHECK-NEXT:    or %s2, 3, (0)1
+; CHECK-NEXT:    or %s2, 11, (0)1
 ; CHECK-NEXT:    or %s3, 0, (0)1
 ; CHECK-NEXT:    bsic %s10, (, %s12)
 ; CHECK-NEXT:    or %s11, 0, %s9
-  %r = urem i128 %a, 3
+  %r = urem i128 %a, 11
   ret i128 %r
 }
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -460,13 +460,17 @@
 define i64 @urem_i64_3(i64 %x) nounwind {
 ; X32-LABEL: urem_i64_3:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    pushl $0
-; X32-NEXT:    pushl $3
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    pushl {{[0-9]+}}(%esp)
-; X32-NEXT:    calll __umoddi3
-; X32-NEXT:    addl $28, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edx
+;
X32-NEXT: shrl %edx +; X32-NEXT: leal (%edx,%edx,2), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_3: @@ -487,13 +491,17 @@ define i64 @urem_i64_5(i64 %x) nounwind { ; X32-LABEL: urem_i64_5: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $5 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl $2, %edx +; X32-NEXT: leal (%edx,%edx,4), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_5: @@ -514,13 +522,18 @@ define i64 @urem_i64_15(i64 %x) nounwind { ; X32-LABEL: urem_i64_15: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $15 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-2004318071, %edx # imm = 0x88888889 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl $3, %edx +; X32-NEXT: leal (%edx,%edx,4), %eax +; X32-NEXT: leal (%eax,%eax,2), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_15: @@ -542,13 +555,19 @@ define i64 @urem_i64_17(i64 %x) nounwind { ; X32-LABEL: urem_i64_17: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $17 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, 
%esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: andl $-16, %eax +; X32-NEXT: shrl $4, %edx +; X32-NEXT: addl %eax, %edx +; X32-NEXT: subl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_17: @@ -571,13 +590,23 @@ define i64 @urem_i64_255(i64 %x) nounwind { ; X32-LABEL: urem_i64_255: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $255 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl $-2139062143, %edx # imm = 0x80808081 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $7, %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shll $8, %eax +; X32-NEXT: subl %eax, %edx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_255: @@ -599,13 +628,19 @@ define i64 @urem_i64_257(i64 %x) nounwind { ; X32-LABEL: urem_i64_257: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $257 # imm = 0x101 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-16711935, %edx # imm = 0xFF00FF01 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: andl $-256, %eax +; X32-NEXT: shrl $8, %edx +; 
X32-NEXT: addl %eax, %edx +; X32-NEXT: subl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_257: @@ -628,13 +663,23 @@ define i64 @urem_i64_65535(i64 %x) nounwind { ; X32-LABEL: urem_i64_65535: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $65535 # imm = 0xFFFF -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl $-2147450879, %edx # imm = 0x80008001 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $15, %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shll $16, %eax +; X32-NEXT: subl %eax, %edx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_65535: @@ -656,13 +701,18 @@ define i64 @urem_i64_65537(i64 %x) nounwind { ; X32-LABEL: urem_i64_65537: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $65537 # imm = 0x10001 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-65535, %edx # imm = 0xFFFF0001 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shrl $16, %eax +; X32-NEXT: shldl $16, %edx, %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_65537: diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ 
b/llvm/test/CodeGen/X86/divmod128.ll @@ -68,7 +68,7 @@ ; X86-64-LABEL: umod128: ; X86-64: # %bb.0: ; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx +; X86-64-NEXT: movl $11, %edx ; X86-64-NEXT: xorl %ecx, %ecx ; X86-64-NEXT: callq __umodti3@PLT ; X86-64-NEXT: popq %rcx @@ -79,7 +79,7 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx @@ -89,7 +89,7 @@ ; WIN64-NEXT: retq - %1 = urem i128 %x, 3 + %1 = urem i128 %x, 11 %2 = trunc i128 %1 to i64 ret i64 %2 } @@ -127,27 +127,30 @@ define i128 @urem_i128_3(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_3: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_3: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq 
$-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 3 @@ -157,27 +160,30 @@ define i128 @urem_i128_5(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_5: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $5, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $2, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_5: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-3689348814741910323, %rdx # imm = 0xCCCCCCCCCCCCCCCD +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $2, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 5 @@ -187,27 +193,32 @@ define i128 @urem_i128_15(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_15: 
; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $15, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $3, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: leaq (%rax,%rax,2), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_15: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $3, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: leaq (%rax,%rax,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 15 @@ -217,27 +228,34 @@ define i128 @urem_i128_17(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_17: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $17, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1 +; 
X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-16, %rax +; X86-64-NEXT: shrq $4, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_17: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F1 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-16, %rax +; WIN64-NEXT: shrq $4, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 17 @@ -247,27 +265,37 @@ define i128 @urem_i128_255(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_255: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $255, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $7, %rdx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: shlq $8, %rax +; X86-64-NEXT: subq %rax, %rdx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl 
%edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_255: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081 +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $7, %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shlq $8, %rax +; WIN64-NEXT: subq %rax, %rdx +; WIN64-NEXT: addq %rcx, %r8 +; WIN64-NEXT: adcq %rdx, %r8 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 255 @@ -277,27 +305,34 @@ define i128 @urem_i128_257(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_257: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $257, %edx # imm = 0x101 -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-256, %rax +; X86-64-NEXT: shrq $8, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_257: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq 
$257, {{[0-9]+}}(%rsp) # imm = 0x101 -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-71777214294589695, %rdx # imm = 0xFF00FF00FF00FF01 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-256, %rax +; WIN64-NEXT: shrq $8, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 257 @@ -307,27 +342,37 @@ define i128 @urem_i128_65535(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_65535: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $15, %rdx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: shlq $16, %rax +; X86-64-NEXT: subq %rax, %rdx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_65535: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, 
%rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001 +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $15, %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shlq $16, %rax +; WIN64-NEXT: subq %rax, %rdx +; WIN64-NEXT: addq %rcx, %r8 +; WIN64-NEXT: adcq %rdx, %r8 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 65535 @@ -337,27 +382,34 @@ define i128 @urem_i128_65537(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_65537: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; X86-64-NEXT: shrq $16, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_65537: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, 
%rcx +; WIN64-NEXT: movabsq $-281470681808895, %rdx # imm = 0xFFFF0000FFFF0001 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; WIN64-NEXT: shrq $16, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 65537