diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4694,6 +4694,27 @@
                  SDValue LL = SDValue(), SDValue LH = SDValue(),
                  SDValue RL = SDValue(), SDValue RH = SDValue()) const;
 
+  /// Attempt to expand an n-bit unsigned div/rem/divrem by constant using
+  /// adds, compares, and an n/2-bit urem by constant which will be expanded by
+  /// DAGCombiner. This is not possible for all constant divisors, and signed
+  /// operations are not expanded.
+  /// \param N Node to expand
+  /// \param Result A vector that will be filled with the parts of the result:
+  ///        the Lo and Hi halves of the quotient and/or remainder, in the
+  ///        order of the results of N.
+  /// \param HiLoVT The value type to use for the Lo and Hi nodes. Should be
+  ///        half of VT.
+  /// \param DAG The SelectionDAG in which the replacement nodes are created.
+  /// \param LL Low bits of the LHS of the operation. You can use this
+  ///        parameter if you want to control how low bits are extracted from
+  ///        the LHS.
+  /// \param LH High bits of the LHS of the operation. See LL for meaning.
+  /// \returns true if the node has been expanded, false if it has not.
+  bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl<SDValue> &Result,
+                              EVT HiLoVT, SelectionDAG &DAG,
+                              SDValue LL = SDValue(),
+                              SDValue LH = SDValue()) const;
+
   /// Expand funnel shift.
   /// \param N Node to expand
   /// \returns The expansion if successful, SDValue() otherwise
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4504,6 +4504,22 @@
     return;
   }
 
+  // Try to expand UDIV by constant.
+  if (isa<ConstantSDNode>(N->getOperand(1))) {
+    EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+    // Only if the new type is legal.
+    if (isTypeLegal(NVT)) {
+      SDValue InL, InH;
+      GetExpandedInteger(N->getOperand(0), InL, InH);
+      SmallVector<SDValue, 2> Result;
+      if (TLI.expandDIVREMByConstant(N, Result, NVT, DAG, InL, InH)) {
+        Lo = Result[0];
+        Hi = Result[1];
+        return;
+      }
+    }
+  }
+
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i16)
     LC = RTLIB::UDIV_I16;
@@ -4539,6 +4555,22 @@
     return;
   }
 
+  // Try to expand UREM by constant.
+  if (isa<ConstantSDNode>(N->getOperand(1))) {
+    EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+    // Only if the new type is legal.
+    if (isTypeLegal(NVT)) {
+      SDValue InL, InH;
+      GetExpandedInteger(N->getOperand(0), InL, InH);
+      SmallVector<SDValue, 2> Result;
+      if (TLI.expandDIVREMByConstant(N, Result, NVT, DAG, InL, InH)) {
+        Lo = Result[0];
+        Hi = Result[1];
+        return;
+      }
+    }
+  }
+
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i16)
     LC = RTLIB::UREM_I16;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7157,6 +7157,197 @@
   return Ok;
 }
 
+// Optimize unsigned Op0 / Constant and Op0 % Constant for types twice as large
+// as a legal VT. The divisor is first split into an odd factor and a power of
+// two (its trailing zero bits). If (1 << (BitWidth / 2)) % OddFactor == 1, the
+// remainder of the input shifted right by the trailing zero count can be
+// computed as:
+//   Sum += __builtin_uadd_overflow(Lo, High, &Sum);
+//   Remainder = Sum % OddFactor
+// This is based on "Remainder by Summing Digits" from Hacker's Delight.
+//
+// The bits shifted off the input are then appended below that remainder to
+// form the remainder for the original divisor. The quotient is recovered from
+// (Op0 - Remainder), an exact multiple of the divisor: the trailing zeros are
+// shifted out and the odd factor is divided out by multiplying with its
+// multiplicative inverse modulo (1 << BitWidth).
+//
+// On success Result holds the low and high halves of each value produced, in
+// node result order: {Lo, Hi} for UDIV and UREM, and {QuotLo, QuotHi, RemLo,
+// RemHi} for UDIVREM. Returns false, leaving Result untouched, if N is a
+// signed operation, the divisor is not a suitable constant, or the target
+// does not want or cannot support the expansion.
+bool TargetLowering::expandDIVREMByConstant(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Result,
+                                            EVT HiLoVT, SelectionDAG &DAG,
+                                            SDValue LL, SDValue LH) const {
+  unsigned Opcode = N->getOpcode();
+  EVT VT = N->getValueType(0);
+
+  // TODO: Support signed.
+  if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
+    return false;
+
+  auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!CN)
+    return false;
+
+  APInt Divisor = CN->getAPIntValue();
+  unsigned BitWidth = Divisor.getBitWidth();
+  unsigned HBitWidth = BitWidth / 2;
+  assert(VT.getScalarSizeInBits() == BitWidth &&
+         HiLoVT.getScalarSizeInBits() == HBitWidth && "Unexpected VTs");
+
+  // Divisor needs to be less than (1 << HBitWidth).
+  APInt HalfMaxPlus1 = APInt::getOneBitSet(BitWidth, HBitWidth);
+  if (Divisor.uge(HalfMaxPlus1))
+    return false;
+
+  // We depend on the UREM by constant optimization in DAGCombiner that requires
+  // high multiply.
+  if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
+      !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
+    return false;
+
+  // Don't expand if the target doesn't want us to.
+  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+  if (isIntDivCheap(VT, Attr))
+    return false;
+
+  // Early out for 0 or 1 divisors. Even divisors are handled below.
+  if (Divisor.ule(1))
+    return false;
+
+  SDLoc dl(N);
+  SDValue Sum;
+
+  // If the divisor is even, shift it until it becomes odd.
+  unsigned TrailingZeros = 0;
+  if (!Divisor[0]) {
+    TrailingZeros = Divisor.countTrailingZeros();
+    Divisor.lshrInPlace(TrailingZeros);
+  }
+
+  // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
+  // then add in the carry.
+  if (HalfMaxPlus1.urem(Divisor).isOneValue()) {
+    assert(!LL == !LH && "Expected both input halves or no input halves!");
+    if (!LL) {
+      LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, N->getOperand(0),
+                       DAG.getIntPtrConstant(0, dl));
+      LH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, N->getOperand(0),
+                       DAG.getIntPtrConstant(1, dl));
+    }
+
+    SDValue DL = LL;
+    SDValue DH = LH;
+
+    // Shift the input by TrailingZeros.
+    if (TrailingZeros) {
+      DL = DAG.getNode(
+          ISD::OR, dl, HiLoVT,
+          DAG.getNode(ISD::SRL, dl, HiLoVT, DL,
+                      DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+          DAG.getNode(ISD::SHL, dl, HiLoVT, DH,
+                      DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+                                                 HiLoVT, dl)));
+      DH = DAG.getNode(ISD::SRL, dl, HiLoVT, DH,
+                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+    }
+
+    // Use addcarry if we can, otherwise fall back to add followed by setcc to
+    // detect carry.
+    // TODO: Support other ways?
+    EVT SetCCType =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
+    if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) {
+      SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
+      Sum = DAG.getNode(ISD::UADDO, dl, VTList, DL, DH);
+      Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum,
+                        DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
+    } else {
+      Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, DL, DH);
+      SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, DL, ISD::SETULT);
+      if (getBooleanContents(HiLoVT) ==
+          TargetLoweringBase::ZeroOrOneBooleanContent)
+        Carry = DAG.getZExtOrTrunc(Carry, dl, HiLoVT);
+      else
+        Carry = DAG.getSelect(dl, HiLoVT, Carry, DAG.getConstant(1, dl, HiLoVT),
+                              DAG.getConstant(0, dl, HiLoVT));
+      Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
+    }
+  }
+
+  // If we didn't find a sum, fall back to the non-constant expansion.
+  if (!Sum)
+    return false;
+
+  // Perform a HiLoVT urem on the Sum using truncated divisor.
+  SDValue RemL =
+      DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
+                  DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
+  // High half of the remainder is 0.
+  SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
+
+  // If we shifted the input, shift the remainder left and add the bits we
+  // shifted off the input.
+  if (TrailingZeros) {
+    APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+    RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+                       DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+    RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL,
+                       DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+                                   DAG.getConstant(Mask, dl, HiLoVT)));
+  }
+
+  // If we only want remainder, we're done.
+  if (Opcode == ISD::UREM) {
+    Result.push_back(RemL);
+    Result.push_back(RemH);
+    return true;
+  }
+
+  // We need to compute the quotient.
+
+  // Join the remainder halves.
+  SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
+
+  // Subtract the remainder from the input. The difference is an exact multiple
+  // of the original divisor.
+  SDValue In = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Rem);
+
+  // Divisor now only holds the odd factor of the original divisor, so the
+  // power of two that was stripped from it has to be divided out separately.
+  // The shift is exact because the low TrailingZeros bits of In are zero.
+  if (TrailingZeros)
+    In = DAG.getNode(ISD::SRL, dl, VT, In,
+                     DAG.getShiftAmountConstant(TrailingZeros, VT, dl));
+
+  // Multiply by the multiplicative inverse of the odd divisor modulo
+  // (1 << BitWidth).
+  APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
+  APInt MulFactor = Divisor.zext(BitWidth + 1);
+  MulFactor = MulFactor.multiplicativeInverse(Mod);
+  MulFactor = MulFactor.trunc(BitWidth);
+
+  SDValue Quotient =
+      DAG.getNode(ISD::MUL, dl, VT, In, DAG.getConstant(MulFactor, dl, VT));
+
+  // Split the quotient.
+  SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                              DAG.getIntPtrConstant(0, dl));
+  SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient,
+                              DAG.getIntPtrConstant(1, dl));
+  Result.push_back(QuotL);
+  Result.push_back(QuotH);
+  if (Opcode == ISD::UDIVREM) {
+    Result.push_back(RemL);
+    Result.push_back(RemH);
+  }
+
+  return true;
+}
+
 // Check that (every element of) Z is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) { return ISD::matchUnaryPredicate( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -20423,9 +20423,19 @@ "Invalid opcode for Div/Rem lowering"); bool isSigned = (Opcode == ISD::SDIVREM); EVT VT = Op->getValueType(0); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); SDLoc dl(Op); + if (VT == MVT::i64 && isa(Op.getOperand(1))) { + SmallVector Result; + if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) { + SDValue Div = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]); + SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), {Div, Rem}); + } + } + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + // If the target has hardware divide, use divide + multiply + subtract: // div = a / b // rem = a - b * div @@ -20474,11 +20484,20 @@ // Lowers REM using divmod helpers // see RTABI section 4.2/4.3 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { + EVT VT = N->getValueType(0); + + if (VT == MVT::i64 && isa(N->getOperand(1))) { + SmallVector Result; + if (expandDIVREMByConstant(N, Result, MVT::i32, DAG)) + return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0), + Result[0], Result[1]); + } + // Build return types (div and rem) std::vector RetTyParams; Type *RetTyElement; - switch (N->getValueType(0).getSimpleVT().SimpleTy) { + switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ 
b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29502,6 +29502,12 @@ assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); + if (isa(Op->getOperand(1))) { + SmallVector Result; + if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG)) + return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]); + } + RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { diff --git a/llvm/test/CodeGen/ARM/div.ll b/llvm/test/CodeGen/ARM/div.ll --- a/llvm/test/CodeGen/ARM/div.ll +++ b/llvm/test/CodeGen/ARM/div.ll @@ -104,3 +104,33 @@ %tmp1 = urem i64 %a, %b ; [#uses=1] ret i64 %tmp1 } + +; Make sure we avoid a libcall for some constants. +define i64 @f7(i64 %a) { +; CHECK-SWDIV-LABEL: f7 +; CHECK-SWDIV: adc +; CHECK-SWDIV: umull +; CHECK-HWDIV-LABEL: f7 +; CHECK-HWDIV: adc +; CHECK-HWDIV: umull +; CHECK-EABI-LABEL: f7 +; CHECK-EABI: adc +; CHECK-EABI: umull + %tmp1 = urem i64 %a, 3 + ret i64 %tmp1 +} + +; Make sure we avoid a libcall for some constants. 
+define i64 @f8(i64 %a) { +; CHECK-SWDIV-LABEL: f8 +; CHECK-SWDIV: adc +; CHECK-SWDIV: umull +; CHECK-HWDIV-LABEL: f8 +; CHECK-HWDIV: adc +; CHECK-HWDIV: umull +; CHECK-EABI-LABEL: f8 +; CHECK-EABI: adc +; CHECK-EABI: umull + %tmp1 = udiv i64 %a, 3 + ret i64 %tmp1 +} diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll --- a/llvm/test/CodeGen/RISCV/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -66,13 +66,26 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { ; RV32-LABEL: udiv64_constant_no_add: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 5 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 838861 +; RV32-NEXT: addi a4, a3, -819 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: srli a6, a5, 2 +; RV32-NEXT: andi a5, a5, -4 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: addi a3, a3, -820 +; RV32-NEXT: mul a3, a5, a3 +; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a3, a0 +; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: udiv64_constant_no_add: diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll --- a/llvm/test/CodeGen/RISCV/div.ll +++ b/llvm/test/CodeGen/RISCV/div.ll @@ -181,13 +181,26 @@ ; ; RV32IM-LABEL: udiv64_constant: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IM-NEXT: li a2, 5 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __udivdi3@plt -; RV32IM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: add a2, a0, a1 
+; RV32IM-NEXT: sltu a3, a2, a0 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: lui a3, 838861 +; RV32IM-NEXT: addi a4, a3, -819 +; RV32IM-NEXT: mulhu a5, a2, a4 +; RV32IM-NEXT: srli a6, a5, 2 +; RV32IM-NEXT: andi a5, a5, -4 +; RV32IM-NEXT: add a5, a5, a6 +; RV32IM-NEXT: sub a2, a2, a5 +; RV32IM-NEXT: sub a5, a0, a2 +; RV32IM-NEXT: addi a3, a3, -820 +; RV32IM-NEXT: mul a3, a5, a3 +; RV32IM-NEXT: mulhu a6, a5, a4 +; RV32IM-NEXT: add a3, a6, a3 +; RV32IM-NEXT: sltu a0, a0, a2 +; RV32IM-NEXT: sub a0, a1, a0 +; RV32IM-NEXT: mul a0, a0, a4 +; RV32IM-NEXT: add a1, a3, a0 +; RV32IM-NEXT: mul a0, a5, a4 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: udiv64_constant: diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll --- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -7,24 +7,51 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_3: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 3 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 699051 +; RV32-NEXT: addi a4, a3, -1365 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: srli a6, a5, 1 +; RV32-NEXT: andi a5, a5, -2 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: addi a3, a3, -1366 +; RV32-NEXT: mul a3, a5, a3 +; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a3, a0 +; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_3: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 3 -; 
RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI0_0) +; RV64-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a4, a3, a2 +; RV64-NEXT: srli a5, a4, 1 +; RV64-NEXT: andi a4, a4, -2 +; RV64-NEXT: lui a6, %hi(.LCPI0_1) +; RV64-NEXT: ld a6, %lo(.LCPI0_1)(a6) +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: sub a3, a3, a4 +; RV64-NEXT: sub a4, a0, a3 +; RV64-NEXT: mul a5, a4, a6 +; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sltu a0, a0, a3 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a1, a5, a0 +; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 3 ret iXLen2 %a @@ -33,24 +60,51 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_5: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 5 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 838861 +; RV32-NEXT: addi a4, a3, -819 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: srli a6, a5, 2 +; RV32-NEXT: andi a5, a5, -4 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: addi a3, a3, -820 +; RV32-NEXT: mul a3, a5, a3 +; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a3, a0 +; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_5: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 5 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call 
__udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI1_0) +; RV64-NEXT: ld a2, %lo(.LCPI1_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a4, a3, a2 +; RV64-NEXT: srli a5, a4, 2 +; RV64-NEXT: andi a4, a4, -4 +; RV64-NEXT: lui a6, %hi(.LCPI1_1) +; RV64-NEXT: ld a6, %lo(.LCPI1_1)(a6) +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: sub a3, a3, a4 +; RV64-NEXT: sub a4, a0, a3 +; RV64-NEXT: mul a5, a4, a6 +; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sltu a0, a0, a3 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a1, a5, a0 +; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 5 ret iXLen2 %a @@ -111,24 +165,55 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_15: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 15 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 559241 +; RV32-NEXT: addi a3, a3, -1911 +; RV32-NEXT: mulhu a3, a2, a3 +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: slli a4, a3, 4 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a3, a0, a2 +; RV32-NEXT: lui a4, 978671 +; RV32-NEXT: addi a5, a4, -274 +; RV32-NEXT: mul a5, a3, a5 +; RV32-NEXT: addi a4, a4, -273 +; RV32-NEXT: mulhu a6, a3, a4 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a5, a0 +; RV32-NEXT: mul a0, a3, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_15: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 15 -; RV64-NEXT: li a3, 
0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI4_0) +; RV64-NEXT: ld a2, %lo(.LCPI4_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a2, a3, a2 +; RV64-NEXT: srli a2, a2, 3 +; RV64-NEXT: slli a4, a2, 4 +; RV64-NEXT: sub a2, a2, a4 +; RV64-NEXT: lui a4, %hi(.LCPI4_1) +; RV64-NEXT: ld a4, %lo(.LCPI4_1)(a4) +; RV64-NEXT: lui a5, %hi(.LCPI4_2) +; RV64-NEXT: ld a5, %lo(.LCPI4_2)(a5) +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: sub a3, a0, a2 +; RV64-NEXT: mul a4, a3, a4 +; RV64-NEXT: mulhu a6, a3, a5 +; RV64-NEXT: add a4, a6, a4 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a5 +; RV64-NEXT: add a1, a4, a0 +; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 15 ret iXLen2 %a @@ -137,24 +222,51 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_17: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 17 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 986895 +; RV32-NEXT: addi a4, a3, 241 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: srli a6, a5, 4 +; RV32-NEXT: andi a5, a5, -16 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: addi a3, a3, 240 +; RV32-NEXT: mul a3, a5, a3 +; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a3, a0 +; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_17: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; 
RV64-NEXT: li a2, 17 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI5_0) +; RV64-NEXT: ld a2, %lo(.LCPI5_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a4, a3, a2 +; RV64-NEXT: srli a5, a4, 4 +; RV64-NEXT: andi a4, a4, -16 +; RV64-NEXT: lui a6, %hi(.LCPI5_1) +; RV64-NEXT: ld a6, %lo(.LCPI5_1)(a6) +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: sub a3, a3, a4 +; RV64-NEXT: sub a4, a0, a3 +; RV64-NEXT: mul a5, a4, a6 +; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sltu a0, a0, a3 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a1, a5, a0 +; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 17 ret iXLen2 %a @@ -163,24 +275,55 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_255: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 255 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 526344 +; RV32-NEXT: addi a3, a3, 129 +; RV32-NEXT: mulhu a3, a2, a3 +; RV32-NEXT: srli a3, a3, 7 +; RV32-NEXT: slli a4, a3, 8 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a3, a0, a2 +; RV32-NEXT: lui a4, 1044464 +; RV32-NEXT: addi a5, a4, -258 +; RV32-NEXT: mul a5, a3, a5 +; RV32-NEXT: addi a4, a4, -257 +; RV32-NEXT: mulhu a6, a3, a4 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a5, a0 +; RV32-NEXT: mul a0, a3, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_255: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 
8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 255 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI6_0) +; RV64-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a2, a3, a2 +; RV64-NEXT: srli a2, a2, 7 +; RV64-NEXT: slli a4, a2, 8 +; RV64-NEXT: sub a2, a2, a4 +; RV64-NEXT: lui a4, %hi(.LCPI6_1) +; RV64-NEXT: ld a4, %lo(.LCPI6_1)(a4) +; RV64-NEXT: lui a5, %hi(.LCPI6_2) +; RV64-NEXT: ld a5, %lo(.LCPI6_2)(a5) +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: sub a3, a0, a2 +; RV64-NEXT: mul a4, a3, a4 +; RV64-NEXT: mulhu a6, a3, a5 +; RV64-NEXT: add a4, a6, a4 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a5 +; RV64-NEXT: add a1, a4, a0 +; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 255 ret iXLen2 %a @@ -189,24 +332,51 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_257: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 257 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 1044496 +; RV32-NEXT: addi a4, a3, -255 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: srli a6, a5, 8 +; RV32-NEXT: andi a5, a5, -256 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: mul a3, a5, a3 +; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a3, a0 +; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_257: ; RV64: # %bb.0: -; 
RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 257 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI7_0) +; RV64-NEXT: ld a2, %lo(.LCPI7_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a4, a3, a2 +; RV64-NEXT: srli a5, a4, 8 +; RV64-NEXT: andi a4, a4, -256 +; RV64-NEXT: lui a6, %hi(.LCPI7_1) +; RV64-NEXT: ld a6, %lo(.LCPI7_1)(a6) +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: sub a3, a3, a4 +; RV64-NEXT: sub a4, a0, a3 +; RV64-NEXT: mul a5, a4, a6 +; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sltu a0, a0, a3 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a2 +; RV64-NEXT: add a1, a5, a0 +; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 257 ret iXLen2 %a @@ -215,26 +385,60 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_65535: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 524296 +; RV32-NEXT: addi a3, a3, 1 +; RV32-NEXT: mulhu a3, a2, a3 +; RV32-NEXT: srli a3, a3, 15 +; RV32-NEXT: slli a4, a3, 16 +; RV32-NEXT: sub a3, a3, a4 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a3, a0, a2 +; RV32-NEXT: lui a4, 1048560 +; RV32-NEXT: addi a5, a4, -2 +; RV32-NEXT: mul a5, a3, a5 +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: mulhu a4, a3, a4 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: slli a1, a0, 16 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: sub a1, a4, a0 +; 
RV32-NEXT: slli a0, a3, 16 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_65535: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -1 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI8_0) +; RV64-NEXT: ld a2, %lo(.LCPI8_0)(a2) +; RV64-NEXT: add a3, a0, a1 +; RV64-NEXT: sltu a4, a3, a0 +; RV64-NEXT: add a3, a3, a4 +; RV64-NEXT: mulhu a2, a3, a2 +; RV64-NEXT: srli a2, a2, 15 +; RV64-NEXT: slli a4, a2, 16 +; RV64-NEXT: sub a2, a2, a4 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: sub a3, a0, a2 +; RV64-NEXT: lui a4, 983039 +; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: addi a4, a4, -1 +; RV64-NEXT: slli a4, a4, 16 +; RV64-NEXT: addi a5, a4, -2 +; RV64-NEXT: mul a5, a3, a5 +; RV64-NEXT: addi a4, a4, -1 +; RV64-NEXT: mulhu a6, a3, a4 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a1, a5, a0 +; RV64-NEXT: mul a0, a3, a4 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 65535 ret iXLen2 %a @@ -243,26 +447,53 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_65537: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, 1 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a2, a0, a1 +; RV32-NEXT: sltu a3, a2, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 1048560 +; RV32-NEXT: addi a4, a3, 1 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: srli a5, a5, 16 +; RV32-NEXT: or a3, a3, a5 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: sub a3, a0, a2 +; RV32-NEXT: mulhu a4, a3, a4 +; RV32-NEXT: slli 
a5, a3, 16 +; RV32-NEXT: sub a4, a4, a5 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: slli a1, a0, 16 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: add a1, a4, a0 +; RV32-NEXT: sub a0, a3, a5 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_65537: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, 1 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: add a2, a0, a1 +; RV64-NEXT: sltu a3, a2, a0 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: lui a3, 983041 +; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: slli a3, a3, 16 +; RV64-NEXT: addi a4, a3, 1 +; RV64-NEXT: mulhu a5, a2, a4 +; RV64-NEXT: lui a6, 1048560 +; RV64-NEXT: and a6, a5, a6 +; RV64-NEXT: srli a5, a5, 16 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 +; RV64-NEXT: mul a3, a5, a3 +; RV64-NEXT: mulhu a6, a5, a4 +; RV64-NEXT: add a3, a6, a3 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a1, a3, a0 +; RV64-NEXT: mul a0, a5, a4 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 65537 ret iXLen2 %a @@ -271,24 +502,65 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_12: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 12 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: slli a2, a1, 30 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: srli a3, a1, 2 +; RV32-NEXT: add a3, a2, a3 +; RV32-NEXT: sltu a2, a3, a2 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: lui a3, 699051 +; RV32-NEXT: addi a4, a3, -1365 +; RV32-NEXT: mulhu a5, a2, a4 +; RV32-NEXT: srli a6, a5, 1 +; RV32-NEXT: andi a5, 
a5, -2 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: sub a2, a2, a5 +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: andi a5, a0, 3 +; RV32-NEXT: or a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: addi a3, a3, -1366 +; RV32-NEXT: mul a3, a5, a3 +; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: mul a0, a0, a4 +; RV32-NEXT: add a1, a3, a0 +; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_12: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 12 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: slli a2, a1, 62 +; RV64-NEXT: srli a3, a0, 2 +; RV64-NEXT: or a2, a3, a2 +; RV64-NEXT: srli a3, a1, 2 +; RV64-NEXT: lui a4, %hi(.LCPI10_0) +; RV64-NEXT: ld a4, %lo(.LCPI10_0)(a4) +; RV64-NEXT: add a3, a2, a3 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: mulhu a3, a2, a4 +; RV64-NEXT: srli a5, a3, 1 +; RV64-NEXT: andi a3, a3, -2 +; RV64-NEXT: add a3, a3, a5 +; RV64-NEXT: sub a2, a2, a3 +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: lui a3, %hi(.LCPI10_1) +; RV64-NEXT: ld a3, %lo(.LCPI10_1)(a3) +; RV64-NEXT: andi a5, a0, 3 +; RV64-NEXT: or a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 +; RV64-NEXT: mul a3, a5, a3 +; RV64-NEXT: mulhu a6, a5, a4 +; RV64-NEXT: add a3, a6, a3 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: mul a0, a0, a4 +; RV64-NEXT: add a1, a3, a0 +; RV64-NEXT: mul a0, a5, a4 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 12 ret iXLen2 %a diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll --- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -7,24 +7,32 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_3: ; RV32: # 
%bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 3 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 699051 +; RV32-NEXT: addi a1, a1, -1365 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a2, a1, 1 +; RV32-NEXT: andi a1, a1, -2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_3: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 3 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI0_0) +; RV64-NEXT: ld a2, %lo(.LCPI0_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a2, a1, 1 +; RV64-NEXT: andi a1, a1, -2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 3 ret iXLen2 %a @@ -33,24 +41,32 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_5: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 5 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 838861 +; RV32-NEXT: addi a1, a1, -819 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a2, a1, 2 +; RV32-NEXT: andi a1, a1, -4 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_5: ; RV64: # %bb.0: -; 
RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 5 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI1_0) +; RV64-NEXT: ld a2, %lo(.LCPI1_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a2, a1, 2 +; RV64-NEXT: andi a1, a1, -4 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 5 ret iXLen2 %a @@ -111,24 +127,32 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_15: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 15 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 559241 +; RV32-NEXT: addi a1, a1, -1911 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: slli a2, a1, 4 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_15: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 15 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI4_0) +; RV64-NEXT: ld a2, %lo(.LCPI4_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: sub a1, a1, a2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 15 ret iXLen2 %a @@ 
-137,24 +161,32 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_17: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 17 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 986895 +; RV32-NEXT: addi a1, a1, 241 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a2, a1, 4 +; RV32-NEXT: andi a1, a1, -16 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_17: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 17 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI5_0) +; RV64-NEXT: ld a2, %lo(.LCPI5_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a2, a1, 4 +; RV64-NEXT: andi a1, a1, -16 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 17 ret iXLen2 %a @@ -163,24 +195,32 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_255: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 255 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 526344 +; RV32-NEXT: addi a1, a1, 129 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a1, a1, 7 +; RV32-NEXT: slli a2, a1, 8 +; RV32-NEXT: sub a1, a1, a2 
+; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_255: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 255 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI6_0) +; RV64-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a1, a1, 7 +; RV64-NEXT: slli a2, a1, 8 +; RV64-NEXT: sub a1, a1, a2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 255 ret iXLen2 %a @@ -189,24 +229,32 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_257: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 257 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 1044496 +; RV32-NEXT: addi a1, a1, -255 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a2, a1, 8 +; RV32-NEXT: andi a1, a1, -256 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_257: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 257 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI7_0) +; RV64-NEXT: ld a2, %lo(.LCPI7_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a2, a1, 8 +; RV64-NEXT: andi a1, a1, -256 +; 
RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 257 ret iXLen2 %a @@ -215,26 +263,32 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_65535: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 524296 +; RV32-NEXT: addi a1, a1, 1 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a1, a1, 15 +; RV32-NEXT: slli a2, a1, 16 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_65535: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, -1 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: lui a2, %hi(.LCPI8_0) +; RV64-NEXT: ld a2, %lo(.LCPI8_0)(a2) +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: mulhu a1, a0, a2 +; RV64-NEXT: srli a1, a1, 15 +; RV64-NEXT: slli a2, a1, 16 +; RV64-NEXT: sub a1, a1, a2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 65535 ret iXLen2 %a @@ -243,26 +297,36 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_65537: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a2, 16 -; RV32-NEXT: addi a2, a2, 1 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add a1, 
a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: lui a1, 1048560 +; RV32-NEXT: addi a2, a1, 1 +; RV32-NEXT: mulhu a2, a0, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: srli a2, a2, 16 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_65537: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a2, 16 -; RV64-NEXT: addiw a2, a2, 1 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: lui a1, 983041 +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: slli a1, a1, 16 +; RV64-NEXT: addi a1, a1, 1 +; RV64-NEXT: mulhu a1, a0, a1 +; RV64-NEXT: lui a2, 1048560 +; RV64-NEXT: and a2, a1, a2 +; RV64-NEXT: srli a1, a1, 16 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 65537 ret iXLen2 %a @@ -271,24 +335,46 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_12: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 12 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3@plt -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: slli a2, a1, 30 +; RV32-NEXT: srli a3, a0, 2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: sltu a2, a1, a2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: lui a2, 699051 +; RV32-NEXT: addi a2, a2, -1365 +; RV32-NEXT: mulhu a2, a1, a2 +; RV32-NEXT: srli a3, a2, 1 +; RV32-NEXT: andi a2, a2, -2 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: andi a0, a0, 3 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: 
li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_12: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 12 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3@plt -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: slli a2, a1, 62 +; RV64-NEXT: srli a3, a0, 2 +; RV64-NEXT: or a2, a3, a2 +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: lui a3, %hi(.LCPI10_0) +; RV64-NEXT: ld a3, %lo(.LCPI10_0)(a3) +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: sltu a2, a1, a2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: mulhu a2, a1, a3 +; RV64-NEXT: srli a3, a2, 1 +; RV64-NEXT: andi a2, a2, -2 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: sub a1, a1, a2 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: andi a0, a0, 3 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 12 ret iXLen2 %a diff --git a/llvm/test/CodeGen/VE/Scalar/rem.ll b/llvm/test/CodeGen/VE/Scalar/rem.ll --- a/llvm/test/CodeGen/VE/Scalar/rem.ll +++ b/llvm/test/CodeGen/VE/Scalar/rem.ll @@ -181,11 +181,11 @@ ; CHECK-NEXT: lea %s2, __umodti3@lo ; CHECK-NEXT: and %s2, %s2, (32)0 ; CHECK-NEXT: lea.sl %s12, __umodti3@hi(, %s2) -; CHECK-NEXT: or %s2, 3, (0)1 +; CHECK-NEXT: or %s2, 11, (0)1 ; CHECK-NEXT: or %s3, 0, (0)1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 - %r = urem i128 %a, 3 + %r = urem i128 %a, 11 ret i128 %r } diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -460,13 +460,17 @@ define i64 @urem_i64_3(i64 %x) nounwind { ; X32-LABEL: urem_i64_3: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $3 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl %edx +; X32-NEXT: leal (%edx,%edx,2), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_3: @@ -487,13 +491,17 @@ define i64 @urem_i64_5(i64 %x) nounwind { ; X32-LABEL: urem_i64_5: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $5 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl $2, %edx +; X32-NEXT: leal (%edx,%edx,4), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_5: @@ -514,13 +522,18 @@ define i64 @urem_i64_15(i64 %x) nounwind { ; X32-LABEL: urem_i64_15: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $15 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-2004318071, %edx # imm = 0x88888889 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl $3, %edx +; X32-NEXT: leal (%edx,%edx,4), %eax +; X32-NEXT: leal (%eax,%eax,2), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_15: @@ -542,13 +555,19 @@ define i64 @urem_i64_17(i64 %x) nounwind { ; X32-LABEL: urem_i64_17: ; X32: # %bb.0: # %entry -; X32-NEXT: subl 
$12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $17 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: andl $-16, %eax +; X32-NEXT: shrl $4, %edx +; X32-NEXT: addl %eax, %edx +; X32-NEXT: subl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_17: @@ -571,13 +590,23 @@ define i64 @urem_i64_255(i64 %x) nounwind { ; X32-LABEL: urem_i64_255: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $255 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl $-2139062143, %edx # imm = 0x80808081 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $7, %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shll $8, %eax +; X32-NEXT: subl %eax, %edx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_255: @@ -599,13 +628,19 @@ define i64 @urem_i64_257(i64 %x) nounwind { ; X32-LABEL: urem_i64_257: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $257 # imm = 0x101 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 
$-16711935, %edx # imm = 0xFF00FF01 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: andl $-256, %eax +; X32-NEXT: shrl $8, %edx +; X32-NEXT: addl %eax, %edx +; X32-NEXT: subl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_257: @@ -628,13 +663,23 @@ define i64 @urem_i64_65535(i64 %x) nounwind { ; X32-LABEL: urem_i64_65535: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $65535 # imm = 0xFFFF -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl $-2147450879, %edx # imm = 0x80008001 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $15, %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shll $16, %eax +; X32-NEXT: subl %eax, %edx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: adcl %edx, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_65535: @@ -656,13 +701,18 @@ define i64 @urem_i64_65537(i64 %x) nounwind { ; X32-LABEL: urem_i64_65537: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $65537 # imm = 0x10001 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-65535, %edx # imm = 0xFFFF0001 +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shrl $16, %eax +; X32-NEXT: shldl $16, %edx, %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl 
; ; X64-LABEL: urem_i64_65537: @@ -685,13 +735,24 @@ define i64 @urem_i64_12(i64 %x) nounwind { ; X32-LABEL: urem_i64_12: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $12 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __umoddi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: shrl $2, %eax +; X32-NEXT: shldl $30, %esi, %ecx +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl %edx +; X32-NEXT: leal (%edx,%edx,2), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: andl $3, %esi +; X32-NEXT: leal (%esi,%ecx,4), %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: urem_i64_12: @@ -713,13 +774,31 @@ define i64 @udiv_i64_3(i64 %x) nounwind { ; X32-LABEL: udiv_i64_3: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $3 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: shrl %edx +; X32-NEXT: leal (%edx,%edx,2), %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB +; X32-NEXT: addl 
%ecx, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_3: @@ -738,13 +817,31 @@ define i64 @udiv_i64_5(i64 %x) nounwind { ; X32-LABEL: udiv_i64_5: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $5 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl $-858993459, %ebx # imm = 0xCCCCCCCD +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: shrl $2, %edx +; X32-NEXT: leal (%edx,%edx,4), %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_5: @@ -763,13 +860,31 @@ define i64 @udiv_i64_15(i64 %x) nounwind { ; X32-LABEL: udiv_i64_15: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $15 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl $-2004318071, %edx # imm = 0x88888889 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edx +; X32-NEXT: shrl $3, %edx +; X32-NEXT: leal (%edx,%edx,4), %eax +; 
X32-NEXT: leal (%eax,%eax,2), %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl $-286331153, %edx # imm = 0xEEEEEEEF +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: imull $-286331154, %ecx, %ecx # imm = 0xEEEEEEEE +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-286331153, %edi, %ecx # imm = 0xEEEEEEEF +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_15: @@ -788,13 +903,33 @@ define i64 @udiv_i64_17(i64 %x) nounwind { ; X32-LABEL: udiv_i64_17: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $17 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl $-252645135, %ebx # imm = 0xF0F0F0F1 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: andl $-16, %eax +; X32-NEXT: shrl $4, %edx +; X32-NEXT: addl %eax, %edx +; X32-NEXT: subl %edx, %esi +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0 +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1 +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_17: @@ -813,13 +948,31 @@ define i64 @udiv_i64_255(i64 %x) nounwind { ; X32-LABEL: udiv_i64_255: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $255 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll 
__udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl $-2139062143, %edx # imm = 0x80808081 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $7, %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shll $8, %eax +; X32-NEXT: subl %eax, %edx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %edx, %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: sbbl $0, %esi +; X32-NEXT: movl $-16843009, %edx # imm = 0xFEFEFEFF +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: imull $-16843010, %ecx, %ecx # imm = 0xFEFEFEFE +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-16843009, %esi, %ecx # imm = 0xFEFEFEFF +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_255: @@ -838,13 +991,33 @@ define i64 @udiv_i64_257(i64 %x) nounwind { ; X32-LABEL: udiv_i64_257: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $257 # imm = 0x101 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl $-16711935, %ebx # imm = 0xFF00FF01 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: andl $-256, %eax +; X32-NEXT: shrl $8, %edx +; X32-NEXT: addl %eax, %edx +; X32-NEXT: subl %edx, %esi +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00 +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-16711935, %edi, %ecx # imm = 
0xFF00FF01 +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_257: @@ -863,13 +1036,33 @@ define i64 @udiv_i64_65535(i64 %x) nounwind { ; X32-LABEL: udiv_i64_65535: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $65535 # imm = 0xFFFF -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: movl $-2147450879, %edx # imm = 0x80008001 +; X32-NEXT: mull %edx +; X32-NEXT: shrl $15, %edx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shll $16, %eax +; X32-NEXT: subl %eax, %edx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %esi, %eax +; X32-NEXT: adcl %edx, %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: sbbl $0, %esi +; X32-NEXT: movl $-65537, %edx # imm = 0xFFFEFFFF +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edx +; X32-NEXT: imull $-65538, %ecx, %ecx # imm = 0xFFFEFFFE +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl %esi, %ecx +; X32-NEXT: shll $16, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: subl %ecx, %edx +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_65535: @@ -888,13 +1081,34 @@ define i64 @udiv_i64_65537(i64 %x) nounwind { ; X32-LABEL: udiv_i64_65537: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $65537 # imm = 0x10001 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: 
movl $-65535, %ebx # imm = 0xFFFF0001 +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: shrl $16, %eax +; X32-NEXT: shldl $16, %edx, %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: subl %esi, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: shll $16, %ecx +; X32-NEXT: subl %ecx, %edx +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: shll $16, %ecx +; X32-NEXT: subl %ecx, %edi +; X32-NEXT: addl %edi, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_65537: @@ -913,13 +1127,37 @@ define i64 @udiv_i64_12(i64 %x) nounwind { ; X32-LABEL: udiv_i64_12: ; X32: # %bb.0: # %entry -; X32-NEXT: subl $12, %esp -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $12 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll __udivdi3 -; X32-NEXT: addl $28, %esp +; X32-NEXT: pushl %ebx +; X32-NEXT: pushl %edi +; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: shrl $2, %eax +; X32-NEXT: movl %edi, %esi +; X32-NEXT: shldl $30, %ecx, %esi +; X32-NEXT: addl %eax, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: shrl %edx +; X32-NEXT: leal (%edx,%edx,2), %eax +; X32-NEXT: subl %eax, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: andl $3, %eax +; X32-NEXT: leal (%eax,%esi,4), %eax +; X32-NEXT: subl %eax, %ecx +; X32-NEXT: sbbl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: popl %esi +; X32-NEXT: popl %edi +; X32-NEXT: popl %ebx ; X32-NEXT: retl ; ; X64-LABEL: udiv_i64_12: diff --git 
a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -68,7 +68,7 @@ ; X86-64-LABEL: umod128: ; X86-64: # %bb.0: ; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx +; X86-64-NEXT: movl $11, %edx ; X86-64-NEXT: xorl %ecx, %ecx ; X86-64-NEXT: callq __umodti3@PLT ; X86-64-NEXT: popq %rcx @@ -79,7 +79,7 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx @@ -89,7 +89,7 @@ ; WIN64-NEXT: retq - %1 = urem i128 %x, 3 + %1 = urem i128 %x, 11 %2 = trunc i128 %1 to i64 ret i64 %2 } @@ -97,25 +97,31 @@ define i64 @udiv128(i128 %x) nounwind { ; X86-64-LABEL: udiv128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rdi, %rsi +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rsi, %rax +; X86-64-NEXT: addq %rdi, %rax +; X86-64-NEXT: imulq %rcx, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: addq %rcx, %r8 +; WIN64-NEXT: adcq $0, 
%r8 +; WIN64-NEXT: movabsq $-6148914691236517205, %r9 # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %r9 +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %r8, %rax +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: imulq %r9, %rax ; WIN64-NEXT: retq @@ -127,27 +133,30 @@ define i128 @urem_i128_3(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_3: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_3: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 3 @@ -157,27 +166,30 @@ define i128 @urem_i128_5(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_5: ; X86-64: # 
%bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $5, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $2, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_5: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-3689348814741910323, %rdx # imm = 0xCCCCCCCCCCCCCCCD +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $2, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 5 @@ -187,27 +199,32 @@ define i128 @urem_i128_15(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_15: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $15, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $3, %rdx +; X86-64-NEXT: 
leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: leaq (%rax,%rax,2), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_15: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $3, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: leaq (%rax,%rax,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 15 @@ -217,27 +234,34 @@ define i128 @urem_i128_17(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_17: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $17, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-16, %rax +; X86-64-NEXT: shrq $4, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_17: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) 
-; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F1 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-16, %rax +; WIN64-NEXT: shrq $4, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 17 @@ -247,27 +271,37 @@ define i128 @urem_i128_255(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_255: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $255, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $7, %rdx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: shlq $8, %rax +; X86-64-NEXT: subq %rax, %rdx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_255: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq 
%xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081 +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $7, %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shlq $8, %rax +; WIN64-NEXT: subq %rax, %rdx +; WIN64-NEXT: addq %rcx, %r8 +; WIN64-NEXT: adcq %rdx, %r8 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 255 @@ -277,27 +311,34 @@ define i128 @urem_i128_257(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_257: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $257, %edx # imm = 0x101 -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-256, %rax +; X86-64-NEXT: shrq $8, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_257: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $257, {{[0-9]+}}(%rsp) # imm = 0x101 -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq 
$-71777214294589695, %rdx # imm = 0xFF00FF00FF00FF01 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-256, %rax +; WIN64-NEXT: shrq $8, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 257 @@ -307,27 +348,37 @@ define i128 @urem_i128_65535(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_65535: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $15, %rdx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: shlq $16, %rax +; X86-64-NEXT: subq %rax, %rdx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_65535: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001 +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $15, %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shlq 
$16, %rax +; WIN64-NEXT: subq %rax, %rdx +; WIN64-NEXT: addq %rcx, %r8 +; WIN64-NEXT: adcq %rdx, %r8 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 65535 @@ -337,27 +388,34 @@ define i128 @urem_i128_65537(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_65537: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: addq %rsi, %rdi +; X86-64-NEXT: adcq $0, %rdi +; X86-64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001 +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; X86-64-NEXT: shrq $16, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rdi +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_65537: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-281470681808895, %rdx # imm = 0xFFFF0000FFFF0001 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; WIN64-NEXT: shrq $16, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 
65537 @@ -367,27 +425,39 @@ define i128 @urem_i128_12(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_12: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $12, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: shldq $62, %rdi, %rax +; X86-64-NEXT: shrq $2, %rsi +; X86-64-NEXT: addq %rax, %rsi +; X86-64-NEXT: adcq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %rsi +; X86-64-NEXT: andl $3, %edi +; X86-64-NEXT: leaq (%rdi,%rsi,4), %rax +; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_12: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shldq $62, %rcx, %rax +; WIN64-NEXT: shrq $2, %r8 +; WIN64-NEXT: addq %rax, %r8 +; WIN64-NEXT: adcq $0, %r8 +; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %r8 +; WIN64-NEXT: andl $3, %ecx +; WIN64-NEXT: leaq (%rcx,%r8,4), %rax +; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: %rem = urem i128 %x, 12 @@ -397,27 +467,47 @@ define i128 @udiv_i128_3(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_3: ; X86-64: # 
%bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $3, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_3: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; 
WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 3 @@ -427,27 +517,47 @@ define i128 @udiv_i128_5(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_5: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $5, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-3689348814741910323, %r8 # imm = 0xCCCCCCCCCCCCCCCD +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: shrq $2, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: subq %rax, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_5: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: shrq $2, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq 
$-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 5 @@ -457,27 +567,51 @@ define i128 @udiv_i128_15(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_15: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $15, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx +; X86-64-NEXT: shrq $3, %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: leaq (%rax,%rax,2), %rax +; X86-64-NEXT: subq %rax, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-1229782938247303442, %r8 # imm = 0xEEEEEEEEEEEEEEEE +; X86-64-NEXT: imulq %rdi, %r8 +; X86-64-NEXT: movabsq $-1229782938247303441, %rcx # imm = 0xEEEEEEEEEEEEEEEF +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_15: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx 
+; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $3, %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: leaq (%rax,%rax,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-1229782938247303442, %rcx # imm = 0xEEEEEEEEEEEEEEEE +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 15 @@ -487,27 +621,51 @@ define i128 @udiv_i128_17(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_17: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $17, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F1 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-16, %rax +; X86-64-NEXT: shrq $4, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_17: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: 
leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-16, %rax +; WIN64-NEXT: shrq $4, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 17 @@ -517,27 +675,55 @@ define i128 @udiv_i128_255(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_255: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $255, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $7, %rdx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: shlq $8, %rax +; X86-64-NEXT: subq %rax, %rdx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq %rdx, %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-72340172838076674, %r8 # imm = 0xFEFEFEFEFEFEFEFE +; X86-64-NEXT: imulq %rdi, %r8 +; X86-64-NEXT: movabsq $-72340172838076673, %rcx # imm = 0xFEFEFEFEFEFEFEFF +; X86-64-NEXT: movq %rdi, %rax 
+; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_255: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081 +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $7, %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shlq $8, %rax +; WIN64-NEXT: subq %rax, %rdx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %r8, %rax +; WIN64-NEXT: adcq %rdx, %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-72340172838076674, %r9 # imm = 0xFEFEFEFEFEFEFEFE +; WIN64-NEXT: imulq %rcx, %r9 +; WIN64-NEXT: movabsq $-72340172838076673, %r10 # imm = 0xFEFEFEFEFEFEFEFF +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 255 @@ -547,27 +733,51 @@ define i128 @udiv_i128_257(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_257: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $257, %edx # imm = 0x101 -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-71777214294589695, %r8 # imm = 
0xFF00FF00FF00FF01 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-256, %rax +; X86-64-NEXT: shrq $8, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00 +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_257: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $257, {{[0-9]+}}(%rsp) # imm = 0x101 -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-256, %rax +; WIN64-NEXT: shrq $8, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00 +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 257 @@ -577,27 +787,55 @@ define i128 @udiv_i128_65535(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_65535: ; X86-64: # 
%bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq $0, %rax +; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: shrq $15, %rdx +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: shlq $16, %rax +; X86-64-NEXT: subq %rax, %rdx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: addq %rsi, %rax +; X86-64-NEXT: adcq %rdx, %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-281479271743490, %r8 # imm = 0xFFFEFFFEFFFEFFFE +; X86-64-NEXT: imulq %rdi, %r8 +; X86-64-NEXT: movabsq $-281479271743489, %rcx # imm = 0xFFFEFFFEFFFEFFFF +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: addq %r8, %rdx +; X86-64-NEXT: imulq %rsi, %rcx +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_65535: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: adcq $0, %rax +; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001 +; WIN64-NEXT: mulq %rdx +; WIN64-NEXT: shrq $15, %rdx +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shlq $16, %rax +; WIN64-NEXT: subq %rax, %rdx +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: addq %r8, %rax +; WIN64-NEXT: adcq %rdx, 
%rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-281479271743490, %r9 # imm = 0xFFFEFFFEFFFEFFFE +; WIN64-NEXT: imulq %rcx, %r9 +; WIN64-NEXT: movabsq $-281479271743489, %r10 # imm = 0xFFFEFFFEFFFEFFFF +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 65535 @@ -607,27 +845,51 @@ define i128 @udiv_i128_65537(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_65537: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rdi, %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-281470681808895, %r8 # imm = 0xFFFF0000FFFF0001 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: movq %rdx, %rax +; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; X86-64-NEXT: shrq $16, %rdx +; X86-64-NEXT: addq %rax, %rdx +; X86-64-NEXT: subq %rdx, %rcx +; X86-64-NEXT: subq %rcx, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000 +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: udiv_i128_65537: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, 
%rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: addq %rdx, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001 +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; WIN64-NEXT: shrq $16, %rdx +; WIN64-NEXT: addq %rax, %rdx +; WIN64-NEXT: subq %rdx, %rcx +; WIN64-NEXT: subq %rcx, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000 +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 65537 @@ -637,27 +899,60 @@ define i128 @udiv_i128_12(i128 %x) nounwind { ; X86-64-LABEL: udiv_i128_12: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $12, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __udivti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movq %rsi, %rax +; X86-64-NEXT: shldq $62, %rdi, %rax +; X86-64-NEXT: movq %rsi, %rcx +; X86-64-NEXT: shrq $2, %rcx +; X86-64-NEXT: addq %rax, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax +; X86-64-NEXT: subq %rax, %rcx +; X86-64-NEXT: movl %edi, %eax +; X86-64-NEXT: andl $3, %eax +; X86-64-NEXT: leaq (%rax,%rcx,4), %rax +; X86-64-NEXT: subq %rax, %rdi +; X86-64-NEXT: sbbq $0, %rsi +; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA +; X86-64-NEXT: imulq %rdi, %rcx +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx +; X86-64-NEXT: imulq %rsi, %r8 +; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; ; WIN64-LABEL: 
udiv_i128_12: ; WIN64: # %bb.0: # %entry -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __udivti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; WIN64-NEXT: movq %xmm0, %rdx -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movq %rdx, %r8 +; WIN64-NEXT: movq %rcx, %r9 +; WIN64-NEXT: movq %rdx, %rax +; WIN64-NEXT: shldq $62, %rcx, %rax +; WIN64-NEXT: movq %rdx, %rcx +; WIN64-NEXT: shrq $2, %rcx +; WIN64-NEXT: addq %rax, %rcx +; WIN64-NEXT: adcq $0, %rcx +; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax +; WIN64-NEXT: subq %rax, %rcx +; WIN64-NEXT: movl %r9d, %eax +; WIN64-NEXT: andl $3, %eax +; WIN64-NEXT: leaq (%rax,%rcx,4), %rax +; WIN64-NEXT: subq %rax, %r9 +; WIN64-NEXT: sbbq $0, %r8 +; WIN64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA +; WIN64-NEXT: imulq %r9, %rcx +; WIN64-NEXT: movq %r9, %rax +; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %rcx, %rdx +; WIN64-NEXT: imulq %r10, %r8 +; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: %rem = udiv i128 %x, 12