Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -490,6 +490,8 @@
     SDValue visitShiftByConstant(SDNode *N);
+    SDValue foldUREM(SDNode *N);
+    SDValue foldSREM(SDNode *N);
     SDValue foldSelectOfConstants(SDNode *N);
     SDValue foldVSelectOfConstants(SDNode *N);
     SDValue foldBinOpIntoSelect(SDNode *BO);
@@ -3911,6 +3913,20 @@
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
+    if (isConstantOrConstantVector(N1)) {
+      // Check if there is a div with the same operands to combine with the rem.
+      unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+      SDNode *DivNode =
+          DAG.getNodeIfExists(DivOpcode, N->getVTList(), {N0, N1});
+      if (!DivNode) {
+        SDValue OptimizedRem = isSigned ? foldSREM(N) : foldUREM(N);
+        if (OptimizedRem.getNode())
+          return OptimizedRem;
+      }
+    }
+  }
+
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
@@ -3942,6 +3958,240 @@
   return SDValue();
 }
 
+/// Given an ISD::UREM where the divisor is a constant, return a DAG expression
+/// that computes the same remainder using only multiplications and shifts.
+/// For an N-bit numerator n and divisor d, with F = 2 * N:
+///   c = ceil(2^F / d), lowbits = (c * n) mod 2^F, n % d = (lowbits * d) >> F
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK)
+SDValue DAGCombiner::foldUREM(SDNode *node) {
+  SDLoc DL(node);
+  EVT VT = node->getValueType(0);
+  EVT FVT;
+  if (VT.isVector()) {
+    EVT SVT =
+        EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+    FVT = EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorElementCount());
+  } else {
+    FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  }
+
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // When optimizing for minimum size, we don't want to expand the rem into a
+  // multiply and a shift.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  // Check to see if we can do this.
+  if (!isTypeLegal(VT) || !isTypeLegal(FVT))
+    return SDValue();
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (!TLI.isOperationLegalOrCustomOrPromote(ISD::MUL, FVT) &&
+      TLI.isOperationExpand(ISD::MUL, FVT))
+    return SDValue();
+
+  SmallVector<SDValue, 16> MagicFactors;
+
+  auto BuildUREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    // Calculate the magic number c = floor((2^F - 1) / d) + 1, which equals
+    // ceil(2^F / d) for the non-power-of-two divisors accepted below.
+    const APInt &D = DivisorConstant->getAPIntValue();
+    APInt C = APInt::getMaxValue(F).udiv(D.zext(F)).uadd_sat(APInt(F, 1));
+    SDValue ApproximateReciprocal = DAG.getConstant(C, DL, FVT.getScalarType());
+
+    MagicFactors.push_back(ApproximateReciprocal);
+
+    assert(!D.isNullValue() && "Divisor cannot be zero");
+
+    if (!D.isStrictlyPositive() || D.isMaxValue() || D.isOneValue() ||
+        D.isPowerOf2()) {
+      // Only divisors in the range (1, 2^(N-1)) are handled here; a remainder
+      // of a division by a power of two is lowered much better elsewhere.
+      return false;
+    }
+
+    return true;
+  };
+
+  // Numerator, zero-extended into the wide type.
+  SDValue Numerator = node->getOperand(0);
+  SDValue ExtendedNumerator = DAG.getZExtOrTrunc(Numerator, DL, FVT);
+
+  // Divisor constant, zero-extended into the wide type.
+  SDValue Divisor = node->getOperand(1);
+  SDValue ExtendedDivisor = DAG.getZExtOrTrunc(Divisor, DL, FVT);
+
+  if (!ISD::matchUnaryPredicate(Divisor, BuildUREMPattern))
+    return SDValue();
+
+  SDValue MagicFactor = VT.isVector()
+                            ? DAG.getBuildVector(FVT, DL, MagicFactors)
+                            : MagicFactors[0];
+
+  // lowbits = (c * n) mod 2^F
+  SDValue Lowbits =
+      DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator);
+
+  // result = (lowbits * d) >> F
+  SDValue Result;
+  if (LegalOperations ? TLI.isOperationLegal(ISD::MULHU, FVT)
+                      : TLI.isOperationLegalOrCustom(ISD::MULHU, FVT))
+    Result = DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedDivisor);
+  else if (LegalOperations
+               ? TLI.isOperationLegal(ISD::UMUL_LOHI, FVT)
+               : TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, FVT)) {
+    SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT),
+                               Lowbits, ExtendedDivisor);
+    Result = SDValue(LoHi.getNode(), 1);
+  } else {
+    return SDValue(); // No MULHU or equivalent.
+  }
+
+  AddToWorklist(MagicFactor.getNode());
+  AddToWorklist(ExtendedNumerator.getNode());
+  AddToWorklist(Lowbits.getNode());
+  AddToWorklist(ExtendedDivisor.getNode());
+  AddToWorklist(Result.getNode());
+
+  return DAG.getZExtOrTrunc(Result, DL, VT);
+}
+
+/// Given an ISD::SREM where the divisor is a constant, return a DAG expression
+/// that computes the same remainder using only multiplications, additions and
+/// shifts. The signed case is reduced to the unsigned computation on the
+/// absolute divisor, with a correction term applied for negative numerators.
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK)
+SDValue DAGCombiner::foldSREM(SDNode *node) {
+  SDLoc DL(node);
+  EVT VT = node->getValueType(0);
+  EVT FVT;
+  if (VT.isVector()) {
+    EVT TmpVT =
+        EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+    FVT =
+        EVT::getVectorVT(*DAG.getContext(), TmpVT, VT.getVectorElementCount());
+  } else {
+    FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  }
+
+  unsigned N = VT.getScalarSizeInBits();
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // Check to see if we can do this.
+  if (!isTypeLegal(VT) || !isTypeLegal(FVT))
+    return SDValue();
+
+  // When optimizing for minimum size, we don't want to expand the rem into a
+  // multiply and a shift.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (!TLI.isOperationLegalOrCustomOrPromote(ISD::MUL, FVT) &&
+      TLI.isOperationExpand(ISD::MUL, FVT))
+    return SDValue();
+
+  if (!TLI.isOperationLegalOrCustomOrPromote(ISD::SRA, FVT) &&
+      TLI.isOperationExpand(ISD::SRA, FVT))
+    return SDValue();
+
+  SmallVector<SDValue, 16> MagicFactors, AbsoluteDivisors;
+
+  auto BuildSREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    // Calculate the magic number c = floor((2^F - 1) / pd) + 1 for the
+    // absolute divisor pd = |d|, adding one more when pd is a power of two.
+    const APInt pd = DivisorConstant->getAPIntValue().abs();
+    APInt IsPow2 = APInt(F, pd.isPowerOf2());
+    APInt C = APInt::getMaxValue(F)
+                  .udiv(pd.zext(F))
+                  .uadd_sat(APInt(F, 1))
+                  .uadd_sat(IsPow2);
+
+    SDValue ApproximateReciprocal = DAG.getConstant(C, DL, FVT.getScalarType());
+    SDValue AbsoluteDivisor = DAG.getConstant(pd, DL, VT.getScalarType());
+
+    MagicFactors.push_back(ApproximateReciprocal);
+    AbsoluteDivisors.push_back(AbsoluteDivisor);
+
+    assert(!pd.isNullValue() && "Divisor cannot be zero");
+
+    if (!pd.isStrictlyPositive() || pd.isMaxSignedValue() || pd.isOneValue() ||
+        pd.isPowerOf2()) {
+      // The absolute divisor must be in the range (1, 2^(N-1)). We can lower
+      // a remainder of a division by a power of two much better elsewhere.
+ return false; + } + + return true; + }; + + // numerator + SDValue Numerator = node->getOperand(0); + SDValue ExtendedNumerator = DAG.getSExtOrTrunc(Numerator, DL, FVT); + + // divisor constant + SDValue Divisor = node->getOperand(1); + + if (!ISD::matchUnaryPredicate(Divisor, BuildSREMPattern)) + return SDValue(); + + // absolute divisor + SDValue AbsoluteDivisor = VT.isVector() + ? DAG.getBuildVector(VT, DL, AbsoluteDivisors) + : AbsoluteDivisors[0]; + SDValue ExtendedAbsoluteDivisor = + DAG.getZExtOrTrunc(AbsoluteDivisor, DL, FVT); + + SDValue MagicFactor = VT.isVector() + ? DAG.getBuildVector(FVT, DL, MagicFactors) + : MagicFactors[0]; + + // lowbits = c * n + SDValue Lowbits = + DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator); + + // highbits = lowbits * pd >> F + SDValue Highbits; + if (LegalOperations ? TLI.isOperationLegal(ISD::MULHU, FVT) + : TLI.isOperationLegalOrCustom(ISD::MULHU, FVT)) + Highbits = + DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedAbsoluteDivisor); + else if (LegalOperations + ? TLI.isOperationLegal(ISD::UMUL_LOHI, FVT) + : TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, FVT)) { + SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT), + Lowbits, ExtendedAbsoluteDivisor); + Highbits = SDValue(LoHi.getNode(), 1); + } else { + return SDValue(); // No mulhu or equivalent + } + SDValue TruncatedHighbits = DAG.getSExtOrTrunc(Highbits, DL, VT); + + // result = highbits -((pd - 1) & (n >> N-1)) + SDValue One = DAG.getConstant(1, DL, VT); + SDValue DecrementedAbsoluteDivisor = + DAG.getNode(ISD::SUB, DL, VT, AbsoluteDivisor, One); + SDValue ShiftAmount = DAG.getConstant(N - 1, DL, VT); + SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, Numerator, ShiftAmount); + SDValue And = DAG.getNode(ISD::AND, DL, VT, DecrementedAbsoluteDivisor, Sign); + SDValue Result = DAG.getNode(ISD::SUB, DL, VT, TruncatedHighbits, And); + + AddToWorklist(MagicFactor.getNode()); + AddToWorklist(ExtendedNumerator.getNode()); + AddToWorklist(Lowbits.getNode()); + AddToWorklist(AbsoluteDivisor.getNode()); + AddToWorklist(ExtendedAbsoluteDivisor.getNode()); + AddToWorklist(Highbits.getNode()); + AddToWorklist(One.getNode()); + AddToWorklist(DecrementedAbsoluteDivisor.getNode()); + AddToWorklist(ShiftAmount.getNode()); + AddToWorklist(Sign.getNode()); + AddToWorklist(And.getNode()); + + return Result; +} + SDValue DAGCombiner::visitMULHS(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); Index: llvm/test/CodeGen/AArch64/srem-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/srem-llk.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @fold_srem_positve_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positve_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x10, #7589 +; CHECK-NEXT: movk x10, #4139, lsl #16 +; CHECK-NEXT: movk x10, #55878, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #689, lsl #48 +; CHECK-NEXT: mov w8, #94 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positve_even(i32 %x) { +; CHECK-LABEL: fold_srem_positve_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x10, #7172 +; CHECK-NEXT: movk x10, 
#61579, lsl #16 +; CHECK-NEXT: movk x10, #54159, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #61, lsl #48 +; CHECK-NEXT: mov w8, #1059 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #1060 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x10, #91 +; CHECK-NEXT: movk x10, #23205, lsl #16 +; CHECK-NEXT: movk x10, #42240, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #90, lsl #48 +; CHECK-NEXT: mov w8, #722 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #723 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x10, #21004 +; CHECK-NEXT: movk x10, #6399, lsl #16 +; CHECK-NEXT: movk x10, #55820, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #2, lsl #48 +; CHECK-NEXT: mov w8, #22980 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #22981 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w9, w8, w9, w0 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #63 // =63 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xffffffc0 +; CHECK-NEXT: sub w0, w0, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
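+; (As an i32, 2^31 is INT_MIN; foldSREM rejects it, and the existing lowering
+; below handles it without a multiply.)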
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: add w8, w0, w8 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0x80000000 +; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #58849 +; CHECK-NEXT: movk x8, #48148, lsl #16 +; CHECK-NEXT: movk x8, #33436, lsl #32 +; CHECK-NEXT: movk x8, #21399, lsl #48 +; CHECK-NEXT: smulh x8, x0, x8 +; CHECK-NEXT: asr x9, x8, #5 +; CHECK-NEXT: add x8, x9, x8, lsr #63 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret + %1 = srem i64 %x, 98 + ret i64 %1 +} Index: llvm/test/CodeGen/AArch64/srem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-seteq.ll +++ llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,17 +83,16 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #9363 +; CHECK-NEXT: mov w10, #9363 ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: movk w9, #37449, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: asr w9, w8, #3 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #14 -; CHECK-NEXT: msub w8, w8, w9, w0 -; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: mov w9, #13 +; CHECK-NEXT: movk w10, #4681, lsl #16 +; CHECK-NEXT: and w9, w9, w8, lsr #15 +; CHECK-NEXT: mul w8, w8, w10 +; CHECK-NEXT: mov w10, #14 +; CHECK-NEXT: umull x8, w8, w10 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i16 %X, 14 Index: llvm/test/CodeGen/AArch64/srem-vector-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/srem-vector-llk.ll @@ -0,0 +1,286 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: fold_srem_vec_1: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: fold_srem_vec_2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: dup v4.4s, w8 +; CHECK-NEXT: movi v2.4s, #95 +; CHECK-NEXT: mul v1.4s, v1.4s, v4.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: movi v3.4h, #94 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, 
v0.8b, v3.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smov w9, v0.h[1] +; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smull x13, w9, w8 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: smull x14, w10, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: smull x15, w11, w8 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: add w13, w13, w9 +; CHECK-NEXT: smull x8, w12, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w14, w14, w10 +; CHECK-NEXT: asr w16, w13, #6 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w15, w15, w11 +; CHECK-NEXT: add w13, w16, w13, lsr #31 +; CHECK-NEXT: asr w16, w14, #6 +; CHECK-NEXT: add w8, w8, w12 +; CHECK-NEXT: add w14, w16, w14, lsr #31 +; CHECK-NEXT: asr w16, w15, #6 +; CHECK-NEXT: add w15, w16, w15, lsr #31 +; CHECK-NEXT: asr w16, w8, #6 +; CHECK-NEXT: add w8, w16, w8, lsr #31 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: msub w9, w13, w16, w9 +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: msub w12, w8, w16, w12 +; CHECK-NEXT: mov v0.h[2], w15 +; CHECK-NEXT: mov v1.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. +define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x12, #7589 +; CHECK-NEXT: movk x12, #4139, lsl #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w10, v0.h[3] +; CHECK-NEXT: mov w11, #94 +; CHECK-NEXT: movk x12, #55878, lsl #32 +; CHECK-NEXT: movk x12, #689, lsl #48 +; CHECK-NEXT: and w11, w11, w10, asr #31 +; CHECK-NEXT: sxtw x10, w10 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: mul x10, x10, x12 +; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: umulh x10, x10, x12 +; CHECK-NEXT: add w12, w8, #31 // =31 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: smov w9, v0.h[0] +; CHECK-NEXT: and w12, w12, #0xffffffe0 +; CHECK-NEXT: sub w8, w8, w12 +; CHECK-NEXT: add w12, w9, #63 // =63 +; CHECK-NEXT: cmp w9, #0 // =0 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: and w12, w12, #0xffffffc0 +; CHECK-NEXT: sub w9, w9, w12 +; CHECK-NEXT: smov w12, v0.h[2] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add w9, w12, #7 // =7 +; CHECK-NEXT: cmp w12, #0 // =0 +; CHECK-NEXT: csel w9, w9, w12, lt +; CHECK-NEXT: and w9, w9, #0xfffffff8 +; CHECK-NEXT: sub w9, w12, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: sub w8, w10, w11 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
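+; (A lane divided by 1 has a known-zero remainder, so only the remaining lanes
+; are lowered with the multiply-based sequence.)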
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x13, #17236 +; CHECK-NEXT: movk x13, #18438, lsl #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov x10, #45591 +; CHECK-NEXT: smov w11, v0.h[1] +; CHECK-NEXT: mov w12, #653 +; CHECK-NEXT: movk x13, #13628, lsl #32 +; CHECK-NEXT: movk x10, #34192, lsl #16 +; CHECK-NEXT: movk x13, #100, lsl #48 +; CHECK-NEXT: and w12, w12, w11, asr #31 +; CHECK-NEXT: sxtw x11, w11 +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: mov w9, #22 +; CHECK-NEXT: movk x10, #25644, lsl #32 +; CHECK-NEXT: mul x11, x11, x13 +; CHECK-NEXT: mov x13, #48291 +; CHECK-NEXT: movk x10, #2849, lsl #48 +; CHECK-NEXT: and w9, w9, w8, asr #31 +; CHECK-NEXT: sxtw x8, w8 +; CHECK-NEXT: movk x13, #1244, lsl #16 +; CHECK-NEXT: smov w14, v0.h[3] +; CHECK-NEXT: mul x8, x8, x10 +; CHECK-NEXT: mov w10, #5422 +; CHECK-NEXT: movk x13, #5559, lsl #32 +; CHECK-NEXT: movk x13, #12, lsl #48 +; CHECK-NEXT: and w10, w10, w14, asr #31 +; CHECK-NEXT: sxtw x14, w14 +; CHECK-NEXT: mul x13, x14, x13 +; CHECK-NEXT: mov w14, #23 +; CHECK-NEXT: umulh x8, x8, x14 +; CHECK-NEXT: mov w14, #654 +; CHECK-NEXT: umulh x11, x11, x14 +; CHECK-NEXT: mov w14, #5423 +; CHECK-NEXT: sub w8, w8, w9 +; CHECK-NEXT: sub w9, w11, w12 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: umulh x13, x13, x14 +; CHECK-NEXT: mov v0.h[1], w9 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: sub w8, w13, w10 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. +define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_i16_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x11, #45591 +; CHECK-NEXT: movk x11, #34192, lsl #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w9, v0.h[2] +; CHECK-NEXT: mov w10, #22 +; CHECK-NEXT: movk x11, #25644, lsl #32 +; CHECK-NEXT: movk x11, #2849, lsl #48 +; CHECK-NEXT: and w10, w10, w9, asr #31 +; CHECK-NEXT: sxtw x9, w9 +; CHECK-NEXT: mul x9, x9, x11 +; CHECK-NEXT: mov x11, #48291 +; CHECK-NEXT: movk x11, #1244, lsl #16 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: mov w13, #5422 +; CHECK-NEXT: movk x11, #5559, lsl #32 +; CHECK-NEXT: movk x11, #12, lsl #48 +; CHECK-NEXT: and w13, w13, w12, asr #31 +; CHECK-NEXT: sxtw x12, w12 +; CHECK-NEXT: mul x11, x12, x11 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: umulh x9, x9, x12 +; CHECK-NEXT: mov w12, #5423 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: umulh x11, x11, x12 +; CHECK-NEXT: mov w12, #32767 +; CHECK-NEXT: add w12, w8, w12 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: and w12, w12, #0xffff8000 +; CHECK-NEXT: sub w8, w8, w12 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: sub w9, w9, w10 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: sub w8, w11, w13 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
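+; (The fold needs a legal type twice as wide as the source; i128 is not legal
+; on AArch64, so i64 elements keep the existing magic-division lowering.)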
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #6055 +; CHECK-NEXT: movk x9, #58853, lsl #16 +; CHECK-NEXT: movk x9, #47142, lsl #32 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: movk x9, #24749, lsl #48 +; CHECK-NEXT: smulh x9, x8, x9 +; CHECK-NEXT: asr x12, x9, #11 +; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: add x9, x12, x9, lsr #63 +; CHECK-NEXT: msub x8, x9, x10, x8 +; CHECK-NEXT: mov x9, #21445 +; CHECK-NEXT: movk x9, #1603, lsl #16 +; CHECK-NEXT: movk x9, #15432, lsl #32 +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: movk x9, #25653, lsl #48 +; CHECK-NEXT: smulh x9, x12, x9 +; CHECK-NEXT: asr x10, x9, #8 +; CHECK-NEXT: add x9, x10, x9, lsr #63 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: msub x9, x9, x10, x12 +; CHECK-NEXT: mov x10, #8549 +; CHECK-NEXT: movk x10, #22795, lsl #16 +; CHECK-NEXT: movk x10, #17096, lsl #32 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: movk x10, #45590, lsl #48 +; CHECK-NEXT: smulh x10, x11, x10 +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: asr x12, x10, #4 +; CHECK-NEXT: add x10, x12, x10, lsr #63 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: msub x10, x10, x12, x11 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.d[1], x9 +; CHECK-NEXT: ret + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/test/CodeGen/AArch64/urem-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/urem-llk.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @fold_urem_positve_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positve_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #7589 +; CHECK-NEXT: movk x9, #4139, lsl #16 +; CHECK-NEXT: movk x9, #55878, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #689, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positve_even(i32 %x) { +; CHECK-LABEL: fold_urem_positve_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #7172 +; CHECK-NEXT: movk x9, #61579, lsl #16 +; CHECK-NEXT: movk x9, #54159, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #61, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. +define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w9, w0, w8 +; CHECK-NEXT: add w8, w8, w9, lsr #1 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w9, w8, w9, w0 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. 
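+; (A urem by a power of two is just a mask of the low bits, which is cheaper
+; than any multiply-based sequence.)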
+define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: and w0, w0, #0x3f +; CHECK-NEXT: ret + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #58849 +; CHECK-NEXT: movk x9, #48148, lsl #16 +; CHECK-NEXT: movk x9, #33436, lsl #32 +; CHECK-NEXT: lsr x8, x0, #1 +; CHECK-NEXT: movk x9, #21399, lsl #48 +; CHECK-NEXT: umulh x8, x8, x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret + %1 = urem i64 %x, 98 + ret i64 %1 +} Index: llvm/test/CodeGen/AArch64/urem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-seteq.ll +++ llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -78,15 +78,15 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 +; CHECK-NEXT: mov w9, #9363 ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #46811, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: mov w9, #14 +; CHECK-NEXT: umull x8, w8, w9 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i16 %X, 14 %cmp = icmp ne i16 %urem, 0 Index: llvm/test/CodeGen/AArch64/urem-vector-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/urem-vector-llk.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: fold_urem_vec_1: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: fold_urem_vec_2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: movi v1.4s, #95 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. 
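+; (When the matching udiv is also used, x - (x / d) * d reuses the division,
+; so visitREM skips the remainder-only expansion.)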
+define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umull x13, w9, w8 +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umull x14, w10, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: umull x15, w11, w8 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: sub w16, w9, w13 +; CHECK-NEXT: umull x8, w12, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w13, w13, w16, lsr #1 +; CHECK-NEXT: sub w16, w10, w14 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w14, w14, w16, lsr #1 +; CHECK-NEXT: sub w16, w11, w15 +; CHECK-NEXT: add w15, w15, w16, lsr #1 +; CHECK-NEXT: sub w16, w12, w8 +; CHECK-NEXT: add w8, w8, w16, lsr #1 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: lsr w14, w14, #6 +; CHECK-NEXT: lsr w13, w13, #6 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: lsr w15, w15, #6 +; CHECK-NEXT: msub w9, w13, w16, w9 +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: msub w12, w8, w16, w12 +; CHECK-NEXT: mov v0.h[2], w15 +; CHECK-NEXT: mov v1.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + + +; Don't fold for divisors that are a power of two. +define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: and w8, w8, #0x3f +; CHECK-NEXT: mov x10, #7589 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: movk x10, #4139, lsl #16 +; CHECK-NEXT: and w8, w8, #0x1f +; CHECK-NEXT: movk x10, #55878, lsl #32 +; CHECK-NEXT: mov v1.h[1], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: movk x10, #689, lsl #48 +; CHECK-NEXT: and w8, w8, #0x7 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov w8, #95 +; CHECK-NEXT: umulh x8, x9, x8 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x11, #45591 +; CHECK-NEXT: movk x11, #34192, lsl #16 +; CHECK-NEXT: movk x11, #25644, lsl #32 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov x9, #17236 +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: movk x11, #2849, lsl #48 +; CHECK-NEXT: movk x9, #18438, lsl #16 +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: mov x11, #48291 +; CHECK-NEXT: movk x9, #13628, lsl #32 +; CHECK-NEXT: movk x11, #1244, lsl #16 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: movk x9, #100, lsl #48 +; CHECK-NEXT: movk x11, #5559, lsl #32 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: movk x11, #12, lsl #48 +; CHECK-NEXT: mul x9, x9, x11 +; CHECK-NEXT: mov w11, #654 +; CHECK-NEXT: umulh x8, x8, x11 +; CHECK-NEXT: mov w11, #23 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: umulh x10, x10, x11 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #5423 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: umulh x8, x9, x8 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. +define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x10, #12109 +; CHECK-NEXT: movk x10, #52170, lsl #16 +; CHECK-NEXT: movk x10, #28749, lsl #32 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: movk x10, #49499, lsl #48 +; CHECK-NEXT: umulh x10, x8, x10 +; CHECK-NEXT: mov w11, #5423 +; CHECK-NEXT: lsr x10, x10, #12 +; CHECK-NEXT: msub x8, x10, x11, x8 +; CHECK-NEXT: mov x10, #21445 +; CHECK-NEXT: movk x10, #1603, lsl #16 +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: movk x10, #15432, lsl #32 +; CHECK-NEXT: movk x10, #25653, lsl #48 +; CHECK-NEXT: lsr x11, x12, #1 +; CHECK-NEXT: umulh x10, x11, x10 +; CHECK-NEXT: mov w11, #654 +; CHECK-NEXT: lsr x10, x10, #7 +; CHECK-NEXT: msub x10, x10, x11, x12 +; CHECK-NEXT: mov x11, #17097 +; CHECK-NEXT: movk x11, #45590, lsl #16 +; CHECK-NEXT: movk x11, #34192, lsl #32 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: movk x11, #25644, lsl #48 +; CHECK-NEXT: umulh x11, x9, x11 +; CHECK-NEXT: sub x12, x9, x11 +; CHECK-NEXT: add x11, x11, x12, lsr #1 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: lsr x11, x11, #4 +; CHECK-NEXT: msub x9, x11, x12, x9 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.d[1], x10 +; CHECK-NEXT: ret + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/test/CodeGen/PowerPC/machine-pre.ll =================================================================== --- llvm/test/CodeGen/PowerPC/machine-pre.ll +++ llvm/test/CodeGen/PowerPC/machine-pre.ll @@ -58,16 +58,21 @@ ; CHECK-P9-LABEL: foo: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mflr r0 +; CHECK-P9-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r0, 16(r1) ; CHECK-P9-NEXT: stdu r1, -80(r1) -; CHECK-P9-NEXT: mr r30, r4 ; 
CHECK-P9-NEXT: mr r29, r3 ; CHECK-P9-NEXT: lis r3, 21845 +; CHECK-P9-NEXT: ori r3, r3, 21845 +; CHECK-P9-NEXT: sldi r3, r3, 32 +; CHECK-P9-NEXT: mr r30, r4 ; CHECK-P9-NEXT: add r28, r30, r29 +; CHECK-P9-NEXT: li r26, 3 +; CHECK-P9-NEXT: oris r3, r3, 21845 ; CHECK-P9-NEXT: ori r27, r3, 21846 ; CHECK-P9-NEXT: b .LBB1_4 ; CHECK-P9-NEXT: .p2align 4 @@ -93,12 +98,9 @@ ; CHECK-P9-NEXT: mr r30, r3 ; CHECK-P9-NEXT: extsw r3, r28 ; CHECK-P9-NEXT: mulld r4, r3, r27 -; CHECK-P9-NEXT: rldicl r5, r4, 1, 63 -; CHECK-P9-NEXT: rldicl r4, r4, 32, 32 -; CHECK-P9-NEXT: add r4, r4, r5 -; CHECK-P9-NEXT: slwi r5, r4, 1 -; CHECK-P9-NEXT: add r4, r4, r5 -; CHECK-P9-NEXT: subf r3, r4, r3 +; CHECK-P9-NEXT: rlwinm r3, r3, 2, 30, 30 +; CHECK-P9-NEXT: mulhdu r4, r4, r26 +; CHECK-P9-NEXT: subf r3, r3, r4 ; CHECK-P9-NEXT: cmplwi r3, 1 ; CHECK-P9-NEXT: beq cr0, .LBB1_1 ; CHECK-P9-NEXT: # %bb.5: # %while.cond @@ -139,6 +141,7 @@ ; CHECK-P9-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-P9-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: blr entry: %add = add nsw i32 %y, %x Index: llvm/test/CodeGen/X86/load-scalar-as-vector.ll =================================================================== --- llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -418,28 +418,28 @@ define <4 x i32> @srem_op1_constant(i32* %p) nounwind { ; SSE-LABEL: srem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movslq (%rdi), %rax -; SSE-NEXT: imulq $818089009, %rax, %rcx # imm = 0x30C30C31 -; SSE-NEXT: movq %rcx, %rdx -; SSE-NEXT: shrq $63, %rdx -; SSE-NEXT: sarq $35, %rcx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movslq (%rdi), %rcx +; SSE-NEXT: movabsq $439208192231179801, %rax # imm = 0x618618618618619 +; SSE-NEXT: imulq %rcx, %rax +; SSE-NEXT: movl $42, %edx +; SSE-NEXT: mulq %rdx +; SSE-NEXT: sarl $31, %ecx +; SSE-NEXT: andl $41, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movd %edx, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: srem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movslq (%rdi), %rax -; AVX-NEXT: imulq $818089009, %rax, %rcx # imm = 0x30C30C31 -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shrq $63, %rdx -; AVX-NEXT: sarq $35, %rcx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movslq (%rdi), %rcx +; AVX-NEXT: movabsq $439208192231179801, %rax # imm = 0x618618618618619 +; AVX-NEXT: imulq %rcx, %rax +; AVX-NEXT: movl $42, %edx +; AVX-NEXT: mulq %rdx +; AVX-NEXT: sarl $31, %ecx +; AVX-NEXT: andl $41, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 ; AVX-NEXT: retq %x = load i32, i32* %p %b = srem i32 %x, 42 @@ -520,29 +520,21 @@ define <16 x i8> @urem_op1_constant(i8* %p) nounwind { ; SSE-LABEL: urem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movb (%rdi), %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb %cl -; SSE-NEXT: movzbl %cl, %ecx -; SSE-NEXT: imull $49, %ecx, %ecx -; SSE-NEXT: shrl $10, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subb %cl, %al -; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; SSE-NEXT: movzwl %ax, %eax +; SSE-NEXT: imull $42, %eax, %eax +; SSE-NEXT: shrl $16, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: urem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: 
movb (%rdi), %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrb %cl -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: imull $49, %ecx, %ecx -; AVX-NEXT: shrl $10, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: movzbl (%rdi), %eax +; AVX-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: imull $42, %eax, %eax +; AVX-NEXT: shrl $16, %eax ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %x = load i8, i8* %p Index: llvm/test/CodeGen/X86/pr14088.ll =================================================================== --- llvm/test/CodeGen/X86/pr14088.ll +++ llvm/test/CodeGen/X86/pr14088.ll @@ -17,23 +17,24 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movslq %r8d, %rax -; CHECK-NEXT: imulq $1374389535, %rax, %rcx # imm = 0x51EB851F -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: shrq $63, %rdi -; CHECK-NEXT: sarq $37, %rcx -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: imull $100, %ecx, %ecx -; CHECK-NEXT: subl %ecx, %eax -; CHECK-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: cwtl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: movslq %r8d, %rdi +; CHECK-NEXT: movabsq $184467440737095517, %rax # imm = 0x28F5C28F5C28F5D +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: movl $100, %edx +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: andl $99, %edi +; CHECK-NEXT: subl %edi, %edx +; CHECK-NEXT: movw %dx, (%rsi) +; CHECK-NEXT: movswl %dx, %eax ; CHECK-NEXT: cltq ; CHECK-NEXT: imulq $1717986919, %rax, %rax # imm = 0x66666667 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $63, %rcx +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: shrq $63, %rdx ; CHECK-NEXT: shrq $34, %rax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movb %al, (%rdx) +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movb %al, (%rcx) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .LBB0_2: # %return ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/srem-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/srem-llk.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK + +define i32 @fold_srem_positve_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positve_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %edx +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $94, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positve_even(i32 %x) { +; CHECK-LABEL: fold_srem_positve_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %edx # imm = 0x424 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $1059, %ecx # imm = 0x423 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rcx +; 
CHECK-NEXT: movabsq $25514168843305051, %rax # imm = 0x5AA5005AA5005B +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $723, %edx # imm = 0x2D3 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $722, %ecx # imm = 0x2D2 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $802695447269900, %rax # imm = 0x2DA0C18FF520C +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $22981, %edx # imm = 0x59C5 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $22980, %ecx # imm = 0x59C4 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $6, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $95, %ecx, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: leal 63(%rax), %ecx +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, %ecx +; CHECK-NEXT: andl $-64, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal 2147483647(%rdi), %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, %eax +; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: imulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: sarq $5, %rdx +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: imulq $98, %rdx, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %1 = srem i64 %x, 98 + ret i64 %1 +} Index: llvm/test/CodeGen/X86/srem-vector-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/srem-vector-llk.ll @@ -0,0 +1,503 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; SSE-LABEL: fold_srem_vec_1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [95,124,98,1003] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fold_srem_vec_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_srem_vec_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; SSE-LABEL: fold_srem_vec_2: +; SSE: # %bb.0: +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [95,95,95,95] +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fold_srem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_srem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [95,95,95,95] +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. 
+define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; SSE-LABEL: combine_srem_sdiv: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $6, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_srem_sdiv: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. +define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_fold_srem_power_of_two: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: leal 31(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-32, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: leal 63(%rcx), %edx +; SSE-NEXT: testw %cx, %cx +; SSE-NEXT: cmovnsl %ecx, %edx +; SSE-NEXT: andl $-64, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: leal 7(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-8, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $45210183, %eax, %ecx # imm = 0x2B1DA47 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $94, %eax +; SSE-NEXT: imulq $95, %rcx, %rcx +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_srem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: leal 31(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-32, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: leal 63(%rcx), %edx +; AVX-NEXT: testw %cx, %cx +; AVX-NEXT: cmovnsl %ecx, %edx +; AVX-NEXT: andl $-64, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: leal 7(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-8, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $45210183, %eax, %ecx # imm = 0x2B1DA47 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $94, %eax +; AVX-NEXT: imulq $95, %rcx, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; SSE-LABEL: dont_fold_srem_one: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $6567229, %eax, %ecx # imm = 0x64353D +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $653, %eax # imm = 0x28D +; SSE-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; SSE-NEXT: leaq (%rcx,%rcx,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rcx, %rdx +; SSE-NEXT: shrq $32, %rdx +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $22, %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: pinsrw $2, %edx, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $5422, %eax # imm = 0x152E +; SSE-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_srem_one: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $6567229, %eax, %ecx # imm = 0x64353D +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $653, %eax # imm = 0x28D +; AVX-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; AVX-NEXT: leaq (%rcx,%rcx,2), %rdx +; AVX-NEXT: shlq $3, %rdx +; AVX-NEXT: subq %rcx, %rdx +; AVX-NEXT: shrq $32, %rdx +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $22, %eax +; AVX-NEXT: subl %eax, %edx +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $5422, %eax # imm = 0x152E +; AVX-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
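+; (2^15 is INT16_MIN for an i16 lane; that lane keeps the existing lowering
+; while the other lanes use the multiply-based sequence.)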
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_i16_smax: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: leal 32767(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; SSE-NEXT: leaq (%rcx,%rcx,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rcx, %rdx +; SSE-NEXT: shrq $32, %rdx +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $22, %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: pinsrw $2, %edx, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $5422, %eax # imm = 0x152E +; SSE-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_i16_smax: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: leal 32767(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; AVX-NEXT: leaq (%rcx,%rcx,2), %rdx +; AVX-NEXT: shlq $3, %rdx +; AVX-NEXT: subq %rcx, %rdx +; AVX-NEXT: shrq $32, %rdx +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $22, %eax +; AVX-NEXT: subl %eax, %edx +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $5422, %eax # imm = 0x152E +; AVX-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
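+; (The expansion needs a legal type twice the operand width for the widening multiply;
+; i128 is not legal here, so i64 srem keeps the existing magic-number lowering.)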
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_fold_srem_i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: addq %rcx, %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $4, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: leaq (%rdx,%rdx,2), %rax +; SSE-NEXT: shlq $3, %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: pextrq $1, %xmm2, %rcx +; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $11, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $8, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX1-LABEL: dont_fold_srem_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $4, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX1-NEXT: shlq $3, %rax +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $11, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $8, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_fold_srem_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $4, %rdx +; AVX2-NEXT: addq %rax, %rdx +; 
AVX2-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX2-NEXT: shlq $3, %rax +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $11, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $8, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/test/CodeGen/X86/urem-i8-constant.ll =================================================================== --- llvm/test/CodeGen/X86/urem-i8-constant.ll +++ llvm/test/CodeGen/X86/urem-i8-constant.ll @@ -7,11 +7,11 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: imull $111, %eax, %ecx -; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: leal (%ecx,%ecx,8), %edx -; CHECK-NEXT: leal (%ecx,%edx,4), %ecx -; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: imull $1772, %eax, %eax # imm = 0x6EC +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: leal (%eax,%eax,8), %ecx +; CHECK-NEXT: leal (%eax,%ecx,4), %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl %t546 = urem i8 %tmp325, 37 Index: llvm/test/CodeGen/X86/urem-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/urem-llk.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK + +define i32 @fold_urem_positve_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positve_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %ecx +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positve_even(i32 %x) { +; CHECK-LABEL: fold_urem_positve_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: movabsq $17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %ecx # imm = 0x424 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. 
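+; (The two tests above show the LKK expansion: with c = floor(2^64 / d) + 1, the
+; remainder is the high 64 bits of (c * x mod 2^64) * d; for d = 95 that c is the
+; movabsq immediate 194176253407468965. When a udiv with the same operands exists,
+; X - (X /u C) * C reuses the quotient instead, so the expansion is skipped below.)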
+define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: imulq $1491936009, %rax, %rax # imm = 0x58ED2309 +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: shrl %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: shrl $6, %ecx +; CHECK-NEXT: imull $95, %ecx, %eax +; CHECK-NEXT: subl %eax, %edi +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: shrq $4, %rdx +; CHECK-NEXT: imulq $98, %rdx, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %1 = urem i64 %x, 98 + ret i64 %1 +} Index: llvm/test/CodeGen/X86/urem-vector-llk.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/urem-vector-llk.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; SSE-LABEL: fold_urem_vec_1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [95,124,98,1003] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: retq +; +; AVX1-LABEL: fold_urem_vec_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, 
%xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; SSE-LABEL: fold_urem_vec_2: +; SSE: # %bb.0: +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: retq +; +; AVX1-LABEL: fold_urem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [95,95,95,95] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. 
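+; (Same reasoning as the scalar combine_urem_udiv test: the quotient of the udiv is
+; reused via X - (X /u C) * C, which is cheaper than a separate LKK expansion.)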
+define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; SSE-LABEL: combine_urem_udiv: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_urem_udiv: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. +define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_power_of_two: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: andl $31, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: andl $7, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $45210183, %eax, %eax # imm = 0x2B1DA47 +; SSE-NEXT: imulq $95, %rax, %rax +; SSE-NEXT: shrq $32, %rax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: andl $31, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: andl $63, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: andl $7, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $45210183, %eax, %eax # imm = 0x2B1DA47 +; AVX-NEXT: imulq $95, %rax, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
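+; (As in the srem tests, a lane dividing by 1 is outside (1, 2^N) and blocks the
+; vector-wide fold.)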
+define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_one: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: imull $6567229, %eax, %eax # imm = 0x64353D +; SSE-NEXT: imulq $654, %rax, %rax # imm = 0x28E +; SSE-NEXT: shrq $32, %rax +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: imull $186737709, %eax, %eax # imm = 0xB21642D +; SSE-NEXT: leaq (%rax,%rax,2), %rcx +; SSE-NEXT: shlq $3, %rcx +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $791992, %eax, %eax # imm = 0xC15B8 +; SSE-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; SSE-NEXT: shrq $32, %rax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_one: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: imull $6567229, %eax, %eax # imm = 0x64353D +; AVX-NEXT: imulq $654, %rax, %rax # imm = 0x28E +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: imull $186737709, %eax, %eax # imm = 0xB21642D +; AVX-NEXT: leaq (%rax,%rax,2), %rcx +; AVX-NEXT: shlq $3, %rcx +; AVX-NEXT: subq %rax, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $791992, %eax, %eax # imm = 0xC15B8 +; AVX-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. 
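+; (Same as the signed i64 case: i128 is not a legal type, so the existing lowering
+; is kept.)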
+define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_fold_urem_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: mulq %rdx +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: addq %rdx, %rax +; SSE-NEXT: shrq $4, %rax +; SSE-NEXT: leaq (%rax,%rax,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: addq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: mulq %rdx +; SSE-NEXT: shrq $12, %rdx +; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE-NEXT: mulq %rdx +; SSE-NEXT: shrq $7, %rdx +; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: dont_fold_urem_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: addq %rdx, %rax +; AVX1-NEXT: shrq $4, %rax +; AVX1-NEXT: leaq (%rax,%rax,2), %rdx +; AVX1-NEXT: shlq $3, %rdx +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: shrq $12, %rdx +; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: shrq $7, %rdx +; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_fold_urem_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: shrq $4, %rax +; AVX2-NEXT: leaq (%rax,%rax,2), %rdx +; AVX2-NEXT: shlq $3, %rdx +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: mulq %rdx +; 
AVX2-NEXT: shrq $12, %rdx +; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: shrq $7, %rdx +; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -625,15 +625,70 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 +; AVX1-NEXT: vpmulld %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2NOBW-NEXT: vpsraw $15, %xmm0, %xmm0 
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = srem <8 x i16> %a, ret <8 x i16> %res } @@ -699,71 +754,49 @@ ; AVX1-LABEL: test_rem7_16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1 ; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $7, 
%xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = srem <16 x i8> %a, Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -553,15 +553,31 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2 -; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $15, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsraw $1, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm1 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 +; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: 
vpsubw %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: retq %res = srem <16 x i16> %a, ret <16 x i16> %res } @@ -647,23 +663,14 @@ ; ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = srem <32 x i8> %a, ret <32 x i8> %res Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -442,20 +442,30 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] -; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 -; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3 -; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm5 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm5 +; AVX512F-NEXT: vpmovdw %zmm5, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 +; AVX512F-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpmuludq %zmm4, 
%zmm2, %zmm3 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -616,16 +616,63 @@ ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2NOBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = urem <8 x i16> %a, ret <8 x i16> %res } @@ -690,61 +737,37 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; 
AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = urem <16 x i8> %a, Index: llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -561,16 +561,29 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 
+; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm0 +; AVX512BW-NEXT: retq %res = urem <16 x i16> %a, ret <16 x i16> %res } @@ -646,20 +659,10 @@ ; ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq %res = urem <32 x i8> %a, ret <32 x i8> %res Index: llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -440,22 +440,23 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm4 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ;