Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -490,6 +490,8 @@
 
     SDValue visitShiftByConstant(SDNode *N);
 
+    SDValue foldUREM(SDNode *N);
+    SDValue foldSREM(SDNode *N);
     SDValue foldSelectOfConstants(SDNode *N);
     SDValue foldVSelectOfConstants(SDNode *N);
     SDValue foldBinOpIntoSelect(SDNode *BO);
@@ -3911,6 +3913,17 @@
 
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
+  // The divisor is a non-zero constant and division is not cheap on this
+  // target: compute the remainder directly (LKK), unless a division of the
+  // same operands already exists that the remainder could be combined with.
+  if (isConstantOrConstantVector(N1) && DAG.isKnownNeverZero(N1) &&
+      !TLI.isIntDivCheap(VT, Attr)) {
+    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+    if (!DAG.getNodeIfExists(DivOpcode, N->getVTList(), {N0, N1}))
+      if (SDValue OptimizedRem = isSigned ? foldSREM(N) : foldUREM(N))
+        return OptimizedRem;
+  }
+
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
@@ -3942,6 +3955,243 @@
   return SDValue();
 }
 
+/// Given an ISD::UREM whose divisor is constant, return a DAG expression that
+/// generates the same result using only multiplications and shifts, or an
+/// empty SDValue when the transform does not apply.
+///
+/// With N-bit operands and F = 2 * N, the emitted computation is:
+///   c       = floor((2^F - 1) / d) + 1
+///   lowbits = c * zext(n)                (F-bit multiply)
+///   n % d   = (lowbits * zext(d)) >> F   (high half of the product)
+///
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK)
+SDValue DAGCombiner::foldUREM(SDNode *node) {
+  SDLoc DL(node);
+  EVT VT = node->getValueType(0);
+
+  // When optimising for minimum size, we don't want to expand a rem to a mul
+  // and a high mul.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  // The computation is carried out in an integer type twice as wide as VT.
+  EVT FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  if (VT.isVector())
+    FVT = EVT::getVectorVT(*DAG.getContext(), FVT, VT.getVectorElementCount());
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // Check to see if we can do this.
+  if (!isTypeLegal(VT) || !isTypeLegal(FVT))
+    return SDValue();
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (TLI.isOperationExpand(ISD::MUL, FVT))
+    return SDValue();
+
+  // Pick the high-multiply form before creating any node, so that bailing out
+  // here does not leave a dead MUL behind.
+  auto IsAvailable = [&](unsigned Opcode) {
+    return LegalOperations ? TLI.isOperationLegal(Opcode, FVT)
+                           : TLI.isOperationLegalOrCustom(Opcode, FVT);
+  };
+  const bool UseMULHU = IsAvailable(ISD::MULHU);
+  if (!UseMULHU && !IsAvailable(ISD::UMUL_LOHI))
+    return SDValue(); // No mulhu or equivalent
+
+  SmallVector<SDValue, 16> MagicFactors;
+
+  auto BuildUREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    const APInt &D = DivisorConstant->getAPIntValue();
+    assert(!D.isNullValue() && "Divisor cannot be zero");
+
+    // Only divisors d with 1 < d < 2^(N-1) that are not powers of two are
+    // handled: isStrictlyPositive() is a signed test, so divisors with the top
+    // bit set are rejected as well. Remainders by powers of two are lowered
+    // much better elsewhere.
+    if (!D.isStrictlyPositive() || D.isMaxValue() || D.isOneValue() ||
+        D.isPowerOf2())
+      return false;
+
+    // Magic number: c = floor((2^F - 1) / d) + 1.
+    APInt C = APInt::getMaxValue(F).udiv(D.zext(F)).uadd_sat(APInt(F, 1));
+    MagicFactors.push_back(DAG.getConstant(C, DL, FVT.getScalarType()));
+    return true;
+  };
+
+  // numerator
+  SDValue Numerator = node->getOperand(0);
+  SDValue ExtendedNumerator = DAG.getZExtOrTrunc(Numerator, DL, FVT);
+
+  // divisor constant
+  SDValue Divisor = node->getOperand(1);
+  SDValue ExtendedDivisor = DAG.getZExtOrTrunc(Divisor, DL, FVT);
+
+  if (!ISD::matchUnaryPredicate(Divisor, BuildUREMPattern))
+    return SDValue();
+
+  SDValue MagicFactor = VT.isVector()
+                            ? DAG.getBuildVector(FVT, DL, MagicFactors)
+                            : MagicFactors[0];
+
+  // lowbits = c * n
+  SDValue Lowbits =
+      DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator);
+
+  // result = (lowbits * d) >> F
+  SDValue Result;
+  if (UseMULHU) {
+    Result = DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedDivisor);
+  } else {
+    SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT),
+                               Lowbits, ExtendedDivisor);
+    Result = SDValue(LoHi.getNode(), 1);
+  }
+
+  AddToWorklist(MagicFactor.getNode());
+  AddToWorklist(ExtendedNumerator.getNode());
+  AddToWorklist(Lowbits.getNode());
+  AddToWorklist(ExtendedDivisor.getNode());
+  AddToWorklist(Result.getNode());
+
+  return DAG.getZExtOrTrunc(Result, DL, VT);
+}
+
+/// Given an ISD::SREM whose divisor is constant, return a DAG expression that
+/// generates the same result using only multiplications, subtractions and
+/// shifts, or an empty SDValue when the transform does not apply.
+///
+/// With N-bit operands, F = 2 * N and pd = |d|, the emitted computation is:
+///   c        = floor((2^F - 1) / pd) + 1
+///   lowbits  = c * sext(n)                 (F-bit multiply)
+///   highbits = (lowbits * zext(pd)) >> F   (high half of the product)
+///   n % d    = highbits - ((pd - 1) & (n >> (N - 1)))
+///
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK)
+SDValue DAGCombiner::foldSREM(SDNode *node) {
+  SDLoc DL(node);
+  EVT VT = node->getValueType(0);
+
+  // When optimising for minimum size, we don't want to expand a rem to a mul
+  // and a high mul.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  // The computation is carried out in an integer type twice as wide as VT.
+  EVT FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  if (VT.isVector())
+    FVT = EVT::getVectorVT(*DAG.getContext(), FVT, VT.getVectorElementCount());
+  unsigned N = VT.getScalarSizeInBits();
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // Check to see if we can do this.
+  if (!isTypeLegal(VT) || !isTypeLegal(FVT))
+    return SDValue();
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (TLI.isOperationExpand(ISD::MUL, FVT))
+    return SDValue();
+
+  // The sign mask is computed with an SRA on VT (not on FVT), so that is the
+  // type it has to be available for.
+  if (TLI.isOperationExpand(ISD::SRA, VT))
+    return SDValue();
+
+  // Pick the high-multiply form before creating any node, so that bailing out
+  // here does not leave a dead MUL behind.
+  auto IsAvailable = [&](unsigned Opcode) {
+    return LegalOperations ? TLI.isOperationLegal(Opcode, FVT)
+                           : TLI.isOperationLegalOrCustom(Opcode, FVT);
+  };
+  const bool UseMULHU = IsAvailable(ISD::MULHU);
+  if (!UseMULHU && !IsAvailable(ISD::UMUL_LOHI))
+    return SDValue(); // No mulhu or equivalent
+
+  SmallVector<SDValue, 16> MagicFactors, AbsoluteDivisors;
+
+  auto BuildSREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    APInt PD = DivisorConstant->getAPIntValue().abs();
+    assert(!PD.isNullValue() && "Divisor cannot be zero");
+
+    // The absolute divisor must satisfy 1 < pd < 2^(N-1) - 1; this also
+    // rejects the minimum signed value, whose abs() is still negative.
+    // Remainders by powers of two are lowered much better elsewhere.
+    if (!PD.isStrictlyPositive() || PD.isMaxSignedValue() || PD.isOneValue() ||
+        PD.isPowerOf2())
+      return false;
+
+    // Magic number: c = floor((2^F - 1) / pd) + 1. No power-of-two correction
+    // is needed because power-of-two divisors were rejected above.
+    APInt C = APInt::getMaxValue(F).udiv(PD.zext(F)).uadd_sat(APInt(F, 1));
+    MagicFactors.push_back(DAG.getConstant(C, DL, FVT.getScalarType()));
+    AbsoluteDivisors.push_back(DAG.getConstant(PD, DL, VT.getScalarType()));
+    return true;
+  };
+
+  // numerator
+  SDValue Numerator = node->getOperand(0);
+  SDValue ExtendedNumerator = DAG.getSExtOrTrunc(Numerator, DL, FVT);
+
+  // divisor constant
+  SDValue Divisor = node->getOperand(1);
+
+  if (!ISD::matchUnaryPredicate(Divisor, BuildSREMPattern))
+    return SDValue();
+
+  // absolute divisor
+  SDValue AbsoluteDivisor = VT.isVector()
+                                ? DAG.getBuildVector(VT, DL, AbsoluteDivisors)
+                                : AbsoluteDivisors[0];
+  SDValue ExtendedAbsoluteDivisor =
+      DAG.getZExtOrTrunc(AbsoluteDivisor, DL, FVT);
+
+  SDValue MagicFactor = VT.isVector()
+                            ? DAG.getBuildVector(FVT, DL, MagicFactors)
+                            : MagicFactors[0];
+
+  // lowbits = c * n
+  SDValue Lowbits =
+      DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator);
+
+  // highbits = (lowbits * pd) >> F
+  SDValue Highbits;
+  if (UseMULHU) {
+    Highbits =
+        DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedAbsoluteDivisor);
+  } else {
+    SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT),
+                               Lowbits, ExtendedAbsoluteDivisor);
+    Highbits = SDValue(LoHi.getNode(), 1);
+  }
+  SDValue TruncatedHighbits = DAG.getSExtOrTrunc(Highbits, DL, VT);
+
+  // result = highbits - ((pd - 1) & (n >> (N - 1)))
+  SDValue One = DAG.getConstant(1, DL, VT);
+  SDValue DecrementedAbsoluteDivisor =
+      DAG.getNode(ISD::SUB, DL, VT, AbsoluteDivisor, One);
+  // Shift amounts must use the target's shift-amount type, not the value type.
+  EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue ShiftAmount = DAG.getConstant(N - 1, DL, ShiftVT);
+  SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, Numerator, ShiftAmount);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, DecrementedAbsoluteDivisor, Sign);
+  SDValue Result = DAG.getNode(ISD::SUB, DL, VT, TruncatedHighbits, And);
+
+  AddToWorklist(MagicFactor.getNode());
+  AddToWorklist(ExtendedNumerator.getNode());
+  AddToWorklist(Lowbits.getNode());
+  AddToWorklist(AbsoluteDivisor.getNode());
+  AddToWorklist(ExtendedAbsoluteDivisor.getNode());
+  AddToWorklist(Highbits.getNode());
+  AddToWorklist(One.getNode());
+  AddToWorklist(DecrementedAbsoluteDivisor.getNode());
+  AddToWorklist(ShiftAmount.getNode());
+  AddToWorklist(Sign.getNode());
+  AddToWorklist(And.getNode());
+
+  return Result;
+}
+
 SDValue DAGCombiner::visitMULHS(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
Index: llvm/test/CodeGen/AArch64/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-lkk.ll @@ -4,15 +4,18 @@ define i32 @fold_srem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_srem_positive_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w8, w8, w0 -; CHECK-NEXT: asr w9, w8, #6 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov 
w9, #95 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #7589 +; CHECK-NEXT: movk x10, #4139, lsl #16 +; CHECK-NEXT: movk x10, #55878, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #689, lsl #48 +; CHECK-NEXT: mov w8, #94 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 95 ret i32 %1 @@ -22,14 +25,18 @@ define i32 @fold_srem_positive_even(i32 %x) { ; CHECK-LABEL: fold_srem_positive_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #36849 -; CHECK-NEXT: movk w8, #15827, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x9, x8, #63 -; CHECK-NEXT: asr x8, x8, #40 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #1060 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #7172 +; CHECK-NEXT: movk x10, #61579, lsl #16 +; CHECK-NEXT: movk x10, #54159, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #61, lsl #48 +; CHECK-NEXT: mov w8, #1059 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #1060 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 1060 ret i32 %1 @@ -39,14 +46,18 @@ define i32 @fold_srem_negative_odd(i32 %x) { ; CHECK-LABEL: fold_srem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65445 -; CHECK-NEXT: movk w8, #42330, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x9, x8, #63 -; CHECK-NEXT: asr x8, x8, #40 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-723 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #91 +; CHECK-NEXT: movk x10, #23205, lsl #16 +; CHECK-NEXT: movk x10, #42240, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #90, lsl #48 +; CHECK-NEXT: mov w8, #722 
+; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #723 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, -723 ret i32 %1 @@ -56,14 +67,18 @@ define i32 @fold_srem_negative_even(i32 %x) { ; CHECK-LABEL: fold_srem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62439 -; CHECK-NEXT: movk w8, #64805, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x9, x8, #63 -; CHECK-NEXT: asr x8, x8, #40 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-22981 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #21004 +; CHECK-NEXT: movk x10, #6399, lsl #16 +; CHECK-NEXT: movk x10, #55820, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #2, lsl #48 +; CHECK-NEXT: mov w8, #22980 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #22981 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, -22981 ret i32 %1 Index: llvm/test/CodeGen/AArch64/srem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-seteq.ll +++ llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,17 +83,16 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #9363 +; CHECK-NEXT: mov w10, #9363 ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: movk w9, #37449, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: asr w9, w8, #3 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #14 -; CHECK-NEXT: msub w8, w8, w9, w0 -; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: mov w9, #13 +; CHECK-NEXT: movk w10, #4681, lsl #16 +; CHECK-NEXT: and w9, w9, w8, lsr #15 +; CHECK-NEXT: mul w8, w8, w10 +; CHECK-NEXT: mov w10, #14 +; CHECK-NEXT: umull x8, w8, w10 +; CHECK-NEXT: lsr x8, x8, 
#32 +; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i16 %X, 14 Index: llvm/test/CodeGen/AArch64/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -4,50 +4,21 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63421 -; CHECK-NEXT: mov w12, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #31710, lsl #16 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: movk w12, #21399, lsl #16 -; CHECK-NEXT: smull x12, w11, w12 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x13, x12, #63 -; CHECK-NEXT: asr x12, x12, #37 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w12, w12, w13 -; CHECK-NEXT: mov w13, #98 -; CHECK-NEXT: sub w9, w9, w8 -; CHECK-NEXT: msub w11, w12, w13, w11 -; CHECK-NEXT: asr w13, w9, #6 -; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #37253 -; CHECK-NEXT: mov w10, #-124 -; CHECK-NEXT: smov w12, v0.h[0] -; CHECK-NEXT: movk w13, #44150, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w12, w13 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: asr w13, w10, #6 -; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: add w10, w13, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w12 -; CHECK-NEXT: mov w10, #63249 -; CHECK-NEXT: smov w13, v0.h[3] -; CHECK-NEXT: movk w10, #48808, lsl #16 -; CHECK-NEXT: smull x10, w13, w10 -; CHECK-NEXT: lsr x12, x10, #63 -; CHECK-NEXT: asr x10, x10, #40 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #-1003 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w10, w8, w13 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: 
ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -56,43 +27,20 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_srem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #37253 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #44150, lsl #16 -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: smull x13, w8, w9 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: smull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: add w13, w13, w8 -; CHECK-NEXT: smull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w14, w14, w10 -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w15, w15, w11 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: asr w16, w14, #6 -; CHECK-NEXT: add w9, w9, w12 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w15, #6 -; CHECK-NEXT: add w15, w16, w15, lsr #31 -; CHECK-NEXT: asr w16, w9, #6 -; CHECK-NEXT: add w9, w16, w9, lsr #31 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 
killed $q0 +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: dup v4.4s, w8 +; CHECK-NEXT: movi v2.4s, #95 +; CHECK-NEXT: mul v1.4s, v1.4s, v4.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: movi v3.4h, #94 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -155,27 +103,30 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_power_of_two: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x12, #7589 +; CHECK-NEXT: movk x12, #4139, lsl #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w10, v0.h[3] +; CHECK-NEXT: mov w11, #94 +; CHECK-NEXT: movk x12, #55878, lsl #32 +; CHECK-NEXT: movk x12, #689, lsl #48 +; CHECK-NEXT: and w11, w11, w10, asr #31 +; CHECK-NEXT: sxtw x10, w10 ; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: mul x10, x10, x12 +; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: umulh x10, x10, x12 ; CHECK-NEXT: add w12, w8, #31 // =31 ; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: mov w11, #37253 ; CHECK-NEXT: csel w12, w12, w8, lt ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: smov w10, v0.h[3] -; CHECK-NEXT: movk w11, #44150, lsl #16 ; CHECK-NEXT: and w12, w12, #0xffffffe0 ; CHECK-NEXT: sub w8, w8, w12 ; CHECK-NEXT: add w12, w9, #63 // =63 -; CHECK-NEXT: smull x11, w10, w11 ; CHECK-NEXT: cmp w9, #0 // =0 -; CHECK-NEXT: lsr x11, x11, #32 ; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: add w11, w11, w10 ; CHECK-NEXT: and w12, w12, #0xffffffc0 ; CHECK-NEXT: sub w9, w9, w12 -; CHECK-NEXT: asr w12, w11, #6 -; CHECK-NEXT: add w11, w12, w11, lsr #31 ; CHECK-NEXT: smov w12, v0.h[2] ; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: add w9, w12, #7 // =7 @@ -184,9 +135,8 @@ ; CHECK-NEXT: and w9, w9, #0xfffffff8 ; 
CHECK-NEXT: sub w9, w12, w9 ; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #95 ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w11, w8, w10 +; CHECK-NEXT: sub w8, w10, w11 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -198,39 +148,46 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 +; CHECK-NEXT: mov x13, #17236 +; CHECK-NEXT: movk x13, #18438, lsl #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: asr w12, w9, #4 -; CHECK-NEXT: add w9, w12, w9, lsr #31 -; CHECK-NEXT: mov w12, #30865 -; CHECK-NEXT: mov w10, #23 +; CHECK-NEXT: mov x10, #45591 ; CHECK-NEXT: smov w11, v0.h[1] -; CHECK-NEXT: movk w12, #51306, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w11, w12 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: asr w12, w10, #9 -; CHECK-NEXT: mov w9, #654 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w11 -; CHECK-NEXT: mov w10, #47143 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: mov w12, #653 +; CHECK-NEXT: movk x13, #13628, lsl #32 +; CHECK-NEXT: movk x10, #34192, lsl #16 +; CHECK-NEXT: movk x13, #100, lsl #48 +; CHECK-NEXT: and w12, w12, w11, asr #31 +; CHECK-NEXT: sxtw x11, w11 +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: mov w9, #22 +; CHECK-NEXT: movk x10, #25644, lsl #32 +; CHECK-NEXT: mul x11, x11, x13 +; CHECK-NEXT: mov x13, #48291 +; CHECK-NEXT: movk x10, #2849, lsl #48 +; CHECK-NEXT: and w9, w9, w8, asr #31 +; CHECK-NEXT: sxtw x8, w8 +; CHECK-NEXT: movk x13, #1244, lsl #16 +; CHECK-NEXT: smov 
w14, v0.h[3] +; CHECK-NEXT: mul x8, x8, x10 +; CHECK-NEXT: mov w10, #5422 +; CHECK-NEXT: movk x13, #5559, lsl #32 +; CHECK-NEXT: movk x13, #12, lsl #48 +; CHECK-NEXT: and w10, w10, w14, asr #31 +; CHECK-NEXT: sxtw x14, w14 +; CHECK-NEXT: mul x13, x14, x13 +; CHECK-NEXT: mov w14, #23 +; CHECK-NEXT: umulh x8, x8, x14 +; CHECK-NEXT: mov w14, #654 +; CHECK-NEXT: umulh x11, x11, x14 +; CHECK-NEXT: mov w14, #5423 +; CHECK-NEXT: sub w8, w8, w9 +; CHECK-NEXT: sub w9, w11, w12 ; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: umulh x13, x13, x14 ; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: mov w9, #5423 ; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w10, w9, w12 +; CHECK-NEXT: sub w8, w13, w10 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -242,36 +199,41 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_i16_smax: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w10, #17097 +; CHECK-NEXT: mov x11, #45591 +; CHECK-NEXT: movk x11, #34192, lsl #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: movk w10, #45590, lsl #16 -; CHECK-NEXT: smull x10, w9, w10 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w9 -; CHECK-NEXT: asr w12, w10, #4 -; CHECK-NEXT: mov w11, #23 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w11, w9 -; CHECK-NEXT: mov w10, #47143 +; CHECK-NEXT: mov w10, #22 +; CHECK-NEXT: movk x11, #25644, lsl #32 +; CHECK-NEXT: movk x11, #2849, lsl #48 +; CHECK-NEXT: and w10, w10, w9, asr #31 +; CHECK-NEXT: sxtw x9, w9 +; CHECK-NEXT: mul x9, x9, x11 +; CHECK-NEXT: mov x11, #48291 +; CHECK-NEXT: movk x11, #1244, lsl #16 ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: mov w13, #5422 +; CHECK-NEXT: movk x11, #5559, lsl #32 +; 
CHECK-NEXT: movk x11, #12, lsl #48 +; CHECK-NEXT: and w13, w13, w12, asr #31 +; CHECK-NEXT: sxtw x12, w12 +; CHECK-NEXT: mul x11, x12, x11 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: umulh x9, x9, x12 +; CHECK-NEXT: mov w12, #5423 ; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov w11, #32767 -; CHECK-NEXT: add w11, w8, w11 +; CHECK-NEXT: umulh x11, x11, x12 +; CHECK-NEXT: mov w12, #32767 +; CHECK-NEXT: add w12, w8, w12 ; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: and w11, w11, #0xffff8000 -; CHECK-NEXT: sub w8, w8, w11 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: and w12, w12, #0xffff8000 +; CHECK-NEXT: sub w8, w8, w12 ; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: sub w9, w9, w10 ; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #5423 ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w10, w8, w12 +; CHECK-NEXT: sub w8, w11, w13 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-lkk.ll +++ llvm/test/CodeGen/AArch64/urem-lkk.ll @@ -4,15 +4,15 @@ define i32 @fold_urem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_urem_positive_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8969 -; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: umull x8, w0, w8 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: sub w9, w0, w8 -; CHECK-NEXT: add w8, w8, w9, lsr #1 -; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov x9, #7589 +; CHECK-NEXT: movk x9, #4139, lsl #16 +; CHECK-NEXT: movk x9, #55878, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #689, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 ; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %1 = urem i32 %x, 95 ret i32 %1 @@ -22,12 +22,15 
@@ define i32 @fold_urem_positive_even(i32 %x) { ; CHECK-LABEL: fold_urem_positive_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16323 -; CHECK-NEXT: movk w8, #63310, lsl #16 -; CHECK-NEXT: umull x8, w0, w8 -; CHECK-NEXT: lsr x8, x8, #42 +; CHECK-NEXT: mov x9, #7172 +; CHECK-NEXT: movk x9, #61579, lsl #16 +; CHECK-NEXT: movk x9, #54159, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #61, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 ; CHECK-NEXT: mov w9, #1060 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 Index: llvm/test/CodeGen/AArch64/urem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-seteq.ll +++ llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -78,15 +78,15 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 +; CHECK-NEXT: mov w9, #9363 ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #46811, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: mov w9, #14 +; CHECK-NEXT: umull x8, w8, w9 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i16 %X, 14 %cmp = icmp ne i16 %urem, 0 Index: llvm/test/CodeGen/AArch64/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -4,44 +4,16 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w11, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: movk 
w11, #21399, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #16913 -; CHECK-NEXT: mov w12, #98 -; CHECK-NEXT: lsr x11, x11, #37 -; CHECK-NEXT: movk w9, #8456, lsl #16 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: ubfx w12, w8, #2, #14 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: mov w11, #124 -; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: msub w8, w9, w11, w8 -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: umov w12, v0.h[0] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w12, w9 -; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: mov w11, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w9, w9, w11, w12 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #2287 -; CHECK-NEXT: movk w9, #16727, lsl #16 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #1003 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w9, w8, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -50,43 +22,16 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w8, w9 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull 
x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w8, w13 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w9 -; CHECK-NEXT: add w9, w9, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: lsr w13, w14, #6 -; CHECK-NEXT: msub w10, w13, w16, w10 -; CHECK-NEXT: lsr w13, w15, #6 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w13, w16, w11 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: movi v1.4s, #95 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -150,26 +95,24 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w10, w8, w9 -; CHECK-NEXT: add w9, w9, w10, lsr #1 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: umov w9, v0.h[0] -; 
CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w10, #0x1f -; CHECK-NEXT: and w9, w9, #0x7 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: and w8, w8, #0x3f +; CHECK-NEXT: mov x10, #7589 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: movk x10, #4139, lsl #16 +; CHECK-NEXT: and w8, w8, #0x1f +; CHECK-NEXT: movk x10, #55878, lsl #32 +; CHECK-NEXT: mov v1.h[1], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: movk x10, #689, lsl #48 +; CHECK-NEXT: and w8, w8, #0x7 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov w8, #95 +; CHECK-NEXT: umulh x8, x9, x8 ; CHECK-NEXT: mov v1.h[3], w8 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret @@ -181,34 +124,36 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 +; CHECK-NEXT: mov x11, #45591 +; CHECK-NEXT: movk x11, #34192, lsl #16 +; CHECK-NEXT: movk x11, #25644, lsl #32 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: lsr x9, x9, #36 -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: mov w9, #30865 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: ubfx w10, w11, #1, #15 -; CHECK-NEXT: umull x9, w10, w9 -; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: msub w9, w9, w10, w11 -; CHECK-NEXT: mov w11, #47143 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: lsr x11, x11, #43 -; CHECK-NEXT: mov v1.h[2], w8 -; 
CHECK-NEXT: msub w8, w11, w9, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov x9, #17236 +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: movk x11, #2849, lsl #48 +; CHECK-NEXT: movk x9, #18438, lsl #16 +; CHECK-NEXT: mul x10, x10, x11 +; CHECK-NEXT: mov x11, #48291 +; CHECK-NEXT: movk x9, #13628, lsl #32 +; CHECK-NEXT: movk x11, #1244, lsl #16 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: movk x9, #100, lsl #48 +; CHECK-NEXT: movk x11, #5559, lsl #32 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: movk x11, #12, lsl #48 +; CHECK-NEXT: mul x9, x9, x11 +; CHECK-NEXT: mov w11, #654 +; CHECK-NEXT: umulh x8, x8, x11 +; CHECK-NEXT: mov w11, #23 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: umulh x10, x10, x11 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #5423 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: umulh x8, x9, x8 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 Index: llvm/test/CodeGen/PowerPC/machine-pre.ll =================================================================== --- llvm/test/CodeGen/PowerPC/machine-pre.ll +++ llvm/test/CodeGen/PowerPC/machine-pre.ll @@ -58,16 +58,21 @@ ; CHECK-P9-LABEL: foo: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mflr r0 +; CHECK-P9-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r0, 16(r1) ; CHECK-P9-NEXT: stdu r1, -80(r1) -; CHECK-P9-NEXT: mr r30, r4 ; CHECK-P9-NEXT: mr r29, r3 ; CHECK-P9-NEXT: lis r3, 21845 +; CHECK-P9-NEXT: ori r3, r3, 21845 +; CHECK-P9-NEXT: sldi r3, r3, 32 +; CHECK-P9-NEXT: mr r30, r4 ; CHECK-P9-NEXT: add r28, r30, r29 +; CHECK-P9-NEXT: li r26, 3 +; CHECK-P9-NEXT: oris r3, r3, 21845 
; CHECK-P9-NEXT: ori r27, r3, 21846 ; CHECK-P9-NEXT: b .LBB1_4 ; CHECK-P9-NEXT: .p2align 4 @@ -93,12 +98,9 @@ ; CHECK-P9-NEXT: mr r30, r3 ; CHECK-P9-NEXT: extsw r3, r28 ; CHECK-P9-NEXT: mulld r4, r3, r27 -; CHECK-P9-NEXT: rldicl r5, r4, 1, 63 -; CHECK-P9-NEXT: rldicl r4, r4, 32, 32 -; CHECK-P9-NEXT: add r4, r4, r5 -; CHECK-P9-NEXT: slwi r5, r4, 1 -; CHECK-P9-NEXT: add r4, r4, r5 -; CHECK-P9-NEXT: subf r3, r4, r3 +; CHECK-P9-NEXT: rlwinm r3, r3, 2, 30, 30 +; CHECK-P9-NEXT: mulhdu r4, r4, r26 +; CHECK-P9-NEXT: subf r3, r3, r4 ; CHECK-P9-NEXT: cmplwi r3, 1 ; CHECK-P9-NEXT: beq cr0, .LBB1_1 ; CHECK-P9-NEXT: # %bb.5: # %while.cond @@ -139,6 +141,7 @@ ; CHECK-P9-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-P9-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: blr entry: %add = add nsw i32 %y, %x Index: llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -11,66 +11,72 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_srem_vec_1: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r5, 689 ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: ori r5, r5, 55878 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 31710 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: oris r5, r5, 4139 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: ori r5, r5, 7589 +; P9LE-NEXT: andi. 
r4, r4, 94 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: lis r5, 528 +; P9LE-NEXT: ori r5, r5, 33825 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: oris r5, r5, 2114 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r5, r5, 4229 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 63421 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r4, r5 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 21399 -; P9LE-NEXT: mulli r4, r4, -124 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 124 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 123 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 668 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r5, r5, 48148 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 58848 +; P9LE-NEXT: ori r5, r5, 42800 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 33437 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 5 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, -16728 -; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 98 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. 
r4, r4, 97 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 65 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 22280 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 61158 +; P9LE-NEXT: ori r5, r5, 14506 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 63249 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 8 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, -1003 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 1003 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 1002 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 @@ -80,203 +86,219 @@ ; ; P9BE-LABEL: fold_srem_vec_1: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: lis r5, 65 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 22280 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, 31710 -; P9BE-NEXT: ori r4, r4, 63421 +; P9BE-NEXT: oris r5, r5, 61158 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -124 +; P9BE-NEXT: ori r5, r5, 14506 +; P9BE-NEXT: andi. 
r4, r4, 1002 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 1003 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: lis r5, 668 +; P9BE-NEXT: ori r5, r5, 48148 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: oris r5, r5, 58848 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r5, r5, 42800 ; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 98 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 97 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -16728 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 528 +; P9BE-NEXT: ori r5, r5, 33825 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 2114 +; P9BE-NEXT: ori r5, r5, 4229 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 63249 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 8 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -1003 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 124 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 123 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, 21399 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r5, r5, 55878 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 4139 +; P9BE-NEXT: ori r5, r5, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 33437 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 5 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 98 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 94 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; ; P8LE-LABEL: fold_srem_vec_1: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 21399 -; P8LE-NEXT: lis r9, -16728 -; P8LE-NEXT: lis r11, -21386 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 33437 -; P8LE-NEXT: ori r9, r9, 63249 -; P8LE-NEXT: ori r11, r11, 37253 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: rldicl r3, r5, 32, 48 -; P8LE-NEXT: rldicl r6, r5, 16, 48 -; P8LE-NEXT: clrldi r7, r5, 48 -; P8LE-NEXT: extsh r8, r3 -; P8LE-NEXT: extsh r10, r6 -; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: lis r4, 528 +; P8LE-NEXT: lis r5, 668 +; P8LE-NEXT: lis r6, 65 +; P8LE-NEXT: li r11, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: ori r4, r4, 33825 +; P8LE-NEXT: ori r5, r5, 48148 +; P8LE-NEXT: ori r6, r6, 22280 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r7, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r4, r4, 2114 +; P8LE-NEXT: 
ori r3, r3, 7589 +; P8LE-NEXT: sldi r6, r6, 32 +; P8LE-NEXT: oris r5, r5, 58848 +; P8LE-NEXT: ori r4, r4, 4229 +; P8LE-NEXT: clrldi r8, r7, 48 +; P8LE-NEXT: rldicl r9, r7, 48, 48 +; P8LE-NEXT: oris r6, r6, 61158 +; P8LE-NEXT: ori r5, r5, 42800 +; P8LE-NEXT: extsh r8, r8 +; P8LE-NEXT: rldicl r10, r7, 32, 48 +; P8LE-NEXT: extsh r9, r9 +; P8LE-NEXT: ori r6, r6, 14506 ; P8LE-NEXT: extsw r8, r8 -; P8LE-NEXT: extsh r12, r7 +; P8LE-NEXT: rldicl r7, r7, 16, 48 +; P8LE-NEXT: extsh r10, r10 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: mulld r3, r8, r3 +; P8LE-NEXT: extsh r7, r7 ; P8LE-NEXT: extsw r10, r10 -; P8LE-NEXT: mulld r4, r8, r4 -; P8LE-NEXT: lis r8, 31710 -; P8LE-NEXT: extsh r0, r5 -; P8LE-NEXT: extsw r12, r12 -; P8LE-NEXT: mulld r9, r10, r9 -; P8LE-NEXT: ori r8, r8, 63421 -; P8LE-NEXT: extsw r10, r0 -; P8LE-NEXT: mulld r11, r12, r11 -; P8LE-NEXT: mulld r8, r10, r8 -; P8LE-NEXT: rldicl r0, r4, 1, 63 -; P8LE-NEXT: rldicl r4, r4, 32, 32 -; P8LE-NEXT: rldicl r30, r9, 1, 63 -; P8LE-NEXT: rldicl r9, r9, 32, 32 -; P8LE-NEXT: rldicl r11, r11, 32, 32 -; P8LE-NEXT: rldicl r8, r8, 32, 32 -; P8LE-NEXT: add r11, r11, r12 -; P8LE-NEXT: srawi r4, r4, 5 -; P8LE-NEXT: subf r8, r10, r8 -; P8LE-NEXT: srawi r9, r9, 8 -; P8LE-NEXT: srwi r10, r11, 31 -; P8LE-NEXT: add r4, r4, r0 -; P8LE-NEXT: srawi r11, r11, 6 -; P8LE-NEXT: add r9, r9, r30 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: add r10, r11, r10 -; P8LE-NEXT: srwi r11, r8, 31 -; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: mulli r4, r4, 98 -; P8LE-NEXT: mulli r9, r9, -1003 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r8, r8, -124 -; P8LE-NEXT: subf r3, r4, r3 -; P8LE-NEXT: subf r4, r9, r6 +; P8LE-NEXT: mulld r4, r9, r4 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: mulld r5, r10, r5 +; P8LE-NEXT: mulld r6, r7, r6 +; P8LE-NEXT: srawi r8, r8, 31 +; P8LE-NEXT: srawi r9, r9, 31 +; P8LE-NEXT: mulhdu r3, r3, r11 +; P8LE-NEXT: li r11, 124 +; P8LE-NEXT: andi. r8, r8, 94 +; P8LE-NEXT: andi. 
r9, r9, 123 +; P8LE-NEXT: mulhdu r4, r4, r11 +; P8LE-NEXT: li r11, 98 +; P8LE-NEXT: mulhdu r5, r5, r11 +; P8LE-NEXT: li r11, 1003 +; P8LE-NEXT: mulhdu r6, r6, r11 +; P8LE-NEXT: subf r3, r8, r3 +; P8LE-NEXT: srawi r8, r10, 31 ; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: srawi r3, r7, 31 +; P8LE-NEXT: andi. r7, r8, 97 +; P8LE-NEXT: subf r4, r9, r4 +; P8LE-NEXT: andi. r3, r3, 1002 +; P8LE-NEXT: subf r5, r7, r5 ; P8LE-NEXT: mtvsrd f1, r4 -; P8LE-NEXT: subf r4, r8, r5 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: subf r3, r3, r6 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_srem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -16728 -; P8BE-NEXT: lis r9, 31710 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: lis r10, -21386 -; P8BE-NEXT: ori r3, r3, 63249 -; P8BE-NEXT: ori r9, r9, 63421 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: ori r10, r10, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: lis r3, 65 +; P8BE-NEXT: mfvsrd r7, v2 +; P8BE-NEXT: lis r4, 668 +; P8BE-NEXT: lis r5, 528 +; P8BE-NEXT: lis r6, 689 +; P8BE-NEXT: li r11, 1003 +; P8BE-NEXT: ori r3, r3, 22280 +; P8BE-NEXT: ori r4, r4, 48148 +; P8BE-NEXT: ori r5, r5, 33825 +; P8BE-NEXT: ori r6, r6, 55878 +; P8BE-NEXT: li r12, 98 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r8, r7, 48 +; P8BE-NEXT: oris r3, r3, 61158 +; P8BE-NEXT: extsh r8, r8 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: rldicl r9, r7, 48, 48 +; P8BE-NEXT: ori r3, r3, 14506 +; P8BE-NEXT: extsw r8, r8 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: sldi r6, 
r6, 32 +; P8BE-NEXT: oris r4, r4, 58848 +; P8BE-NEXT: extsh r9, r9 +; P8BE-NEXT: rldicl r10, r7, 32, 48 +; P8BE-NEXT: rldicl r7, r7, 16, 48 +; P8BE-NEXT: oris r5, r5, 2114 +; P8BE-NEXT: oris r6, r6, 4139 +; P8BE-NEXT: ori r4, r4, 42800 +; P8BE-NEXT: extsw r9, r9 +; P8BE-NEXT: mulld r3, r8, r3 +; P8BE-NEXT: extsh r10, r10 ; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 -; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: ori r5, r5, 4229 +; P8BE-NEXT: ori r6, r6, 7589 +; P8BE-NEXT: extsw r10, r10 ; P8BE-NEXT: extsw r7, r7 -; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: extsw r4, r4 -; P8BE-NEXT: mulld r9, r7, r9 -; P8BE-NEXT: mulld r8, r6, r8 -; P8BE-NEXT: mulld r10, r4, r10 -; P8BE-NEXT: rldicl r11, r3, 1, 63 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: rldicl r12, r8, 1, 63 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: subf r9, r7, r9 -; P8BE-NEXT: srawi r3, r3, 8 -; P8BE-NEXT: srawi r8, r8, 5 -; P8BE-NEXT: add r10, r10, r4 -; P8BE-NEXT: add r3, r3, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: add r8, r8, r12 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r3, r3, -1003 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: mulli r9, r9, -124 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: mulld r4, r9, r4 +; P8BE-NEXT: mulld r5, r10, r5 +; P8BE-NEXT: mulld r6, r7, r6 +; P8BE-NEXT: srawi r8, r8, 31 +; P8BE-NEXT: mulhdu r3, r3, r11 +; P8BE-NEXT: li r11, 124 +; P8BE-NEXT: andi. r8, r8, 1002 +; P8BE-NEXT: srawi r9, r9, 31 +; P8BE-NEXT: srawi r10, r10, 31 +; P8BE-NEXT: mulhdu r4, r4, r12 +; P8BE-NEXT: li r12, 95 +; P8BE-NEXT: mulhdu r5, r5, r11 +; P8BE-NEXT: mulhdu r6, r6, r12 +; P8BE-NEXT: subf r3, r8, r3 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: andi. 
r8, r9, 97 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: subf r5, r8, r6 +; P8BE-NEXT: subf r4, r8, r4 +; P8BE-NEXT: andi. r8, r10, 123 ; P8BE-NEXT: mtvsrd v2, r3 -; P8BE-NEXT: subf r6, r9, r7 -; P8BE-NEXT: sldi r3, r5, 48 -; P8BE-NEXT: subf r4, r10, r4 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: andi. r3, r7, 94 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: subf r3, r3, r6 ; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r4 +; P8BE-NEXT: mtvsrd v4, r5 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 ; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 @@ -288,60 +310,52 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_srem_vec_2: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r5, 689 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r6, r4, r5 -; P9LE-NEXT: rldicl r6, r6, 32, 32 -; P9LE-NEXT: add r4, r6, r4 -; P9LE-NEXT: srwi r6, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: ori r5, r5, 55878 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 4139 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: li r6, 95 +; P9LE-NEXT: ori r5, r5, 7589 +; P9LE-NEXT: andi. 
r4, r4, 94 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r6, r4, r5 -; P9LE-NEXT: rldicl r6, r6, 32, 32 -; P9LE-NEXT: add r4, r6, r4 -; P9LE-NEXT: srwi r6, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 +; P9LE-NEXT: andi. r4, r4, 94 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r6, r4, r5 -; P9LE-NEXT: rldicl r6, r6, 32, 32 -; P9LE-NEXT: add r4, r6, r4 -; P9LE-NEXT: srwi r6, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 +; P9LE-NEXT: andi. r4, r4, 94 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 +; P9LE-NEXT: andi. 
r4, r4, 94 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 @@ -353,62 +367,54 @@ ; ; P9BE-LABEL: fold_srem_vec_2: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: ori r5, r5, 55878 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 4139 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 32, 32 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: li r6, 95 +; P9BE-NEXT: ori r5, r5, 7589 +; P9BE-NEXT: andi. r4, r4, 94 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 32, 32 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: andi. 
r4, r4, 94 +; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 32, 32 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: andi. r4, r4, 94 +; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: andi. 
r4, r4, 94 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 @@ -419,60 +425,50 @@ ; P8LE-LABEL: fold_srem_vec_2: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, -21386 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 37253 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: clrldi r3, r5, 48 -; P8LE-NEXT: rldicl r7, r5, 32, 48 -; P8LE-NEXT: extsh r8, r3 -; P8LE-NEXT: rldicl r6, r5, 48, 48 -; P8LE-NEXT: extsh r10, r7 -; P8LE-NEXT: rldicl r5, r5, 16, 48 -; P8LE-NEXT: extsw r8, r8 -; P8LE-NEXT: extsh r9, r6 -; P8LE-NEXT: extsw r10, r10 -; P8LE-NEXT: extsh r11, r5 -; P8LE-NEXT: mulld r12, r8, r4 -; P8LE-NEXT: extsw r9, r9 -; P8LE-NEXT: extsw r11, r11 -; P8LE-NEXT: mulld r30, r10, r4 -; P8LE-NEXT: mulld r0, r9, r4 -; P8LE-NEXT: mulld r4, r11, r4 -; P8LE-NEXT: rldicl r12, r12, 32, 32 -; P8LE-NEXT: add r8, r12, r8 -; P8LE-NEXT: rldicl r12, r30, 32, 32 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: rldicl r0, r0, 32, 32 -; P8LE-NEXT: rldicl r4, r4, 32, 32 -; P8LE-NEXT: add r10, r12, r10 -; P8LE-NEXT: add r9, r0, r9 -; P8LE-NEXT: srwi r0, r8, 31 -; P8LE-NEXT: add r4, r4, r11 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: srwi r12, r9, 31 -; P8LE-NEXT: add r8, r8, r0 -; P8LE-NEXT: srawi r9, r9, 6 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: srwi r11, r4, 31 -; P8LE-NEXT: srawi r4, r4, 6 -; P8LE-NEXT: add r9, r9, r12 -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: add r4, r4, r11 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r4, r4, 95 -; P8LE-NEXT: subf r3, r8, r3 -; P8LE-NEXT: subf r6, r9, r6 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 -; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r11, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 +; 
P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: extsh r5, r5 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: extsw r5, r5 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: extsh r4, r4 +; P8LE-NEXT: mulld r8, r5, r3 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: extsw r4, r4 +; P8LE-NEXT: mulld r9, r6, r3 +; P8LE-NEXT: mulld r10, r7, r3 +; P8LE-NEXT: mulld r3, r4, r3 +; P8LE-NEXT: srawi r5, r5, 31 +; P8LE-NEXT: srawi r6, r6, 31 +; P8LE-NEXT: mulhdu r8, r8, r11 +; P8LE-NEXT: andi. r5, r5, 94 +; P8LE-NEXT: andi. r6, r6, 94 +; P8LE-NEXT: mulhdu r9, r9, r11 +; P8LE-NEXT: srawi r7, r7, 31 +; P8LE-NEXT: mulhdu r10, r10, r11 +; P8LE-NEXT: mulhdu r3, r3, r11 +; P8LE-NEXT: srawi r4, r4, 31 +; P8LE-NEXT: subf r5, r5, r8 +; P8LE-NEXT: andi. r4, r4, 94 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: andi. r5, r7, 94 +; P8LE-NEXT: subf r6, r6, r9 +; P8LE-NEXT: subf r5, r5, r10 +; P8LE-NEXT: subf r3, r4, r3 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 @@ -483,62 +479,54 @@ ; ; P8BE-LABEL: fold_srem_vec_2: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: li r11, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 ; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: extsh r5, r5 ; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: oris r3, r3, 4139 +; P8BE-NEXT: extsh r5, r5 ; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r6, r6 ; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: mulld r8, r5, r3 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: extsw r5, r5 ; P8BE-NEXT: 
extsh r4, r4 +; P8BE-NEXT: extsw r6, r6 ; P8BE-NEXT: extsw r7, r7 -; P8BE-NEXT: mulld r9, r6, r3 ; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r8, r5, r3 +; P8BE-NEXT: mulld r9, r6, r3 ; P8BE-NEXT: mulld r10, r7, r3 ; P8BE-NEXT: mulld r3, r4, r3 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: add r8, r8, r5 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: add r9, r9, r6 -; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: add r10, r10, r7 -; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: add r3, r3, r4 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: srwi r11, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: subf r5, r8, r5 -; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: srawi r5, r5, 31 +; P8BE-NEXT: srawi r6, r6, 31 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: andi. r5, r5, 94 +; P8BE-NEXT: mulhdu r8, r8, r11 +; P8BE-NEXT: srawi r4, r4, 31 +; P8BE-NEXT: andi. r6, r6, 94 +; P8BE-NEXT: andi. r7, r7, 94 +; P8BE-NEXT: mulhdu r9, r9, r11 +; P8BE-NEXT: mulhdu r10, r10, r11 +; P8BE-NEXT: andi. 
r4, r4, 94 +; P8BE-NEXT: mulhdu r3, r3, r11 +; P8BE-NEXT: subf r5, r5, r8 +; P8BE-NEXT: subf r6, r6, r9 +; P8BE-NEXT: subf r7, r7, r10 +; P8BE-NEXT: subf r3, r4, r3 ; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: subf r7, r10, r7 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: subf r3, r3, r4 ; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r6 ; P8BE-NEXT: mtvsrd v4, r4 ; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 ; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 ; P8BE-NEXT: blr @@ -880,23 +868,6 @@ ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 -; P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r4, r3 @@ -904,10 +875,28 @@ ; P9LE-NEXT: addze r4, r4 ; P9LE-NEXT: slwi r4, r4, 3 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 689 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 55878 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 4139 +; P9LE-NEXT: ori r5, r5, 7589 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: 
andi. r4, r4, 94 +; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; @@ -929,130 +918,133 @@ ; P9BE-NEXT: addze r4, r4 ; P9BE-NEXT: slwi r4, r4, 6 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: srawi r4, r3, 3 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 3 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 55878 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 4139 +; P9BE-NEXT: ori r5, r5, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 3 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 3 +; P9BE-NEXT: srawi r4, r3, 31 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 94 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v3, v2 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_srem_power_of_two: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r9, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 ; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 ; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: clrldi r7, r4, 48 -; P8LE-NEXT: extsh r6, r5 -; P8LE-NEXT: extsh r8, r7 -; P8LE-NEXT: extsw r6, r6 -; P8LE-NEXT: rldicl r9, r4, 48, 48 -; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: clrldi r6, r4, 48 +; P8LE-NEXT: extsh r5, r5 +; P8LE-NEXT: extsh r8, r6 +; P8LE-NEXT: extsw r5, r5 +; P8LE-NEXT: rldicl r7, r4, 48, 48 +; P8LE-NEXT: mulld r3, r5, r3 ; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: extsh r10, r9 +; P8LE-NEXT: extsh r10, r7 ; P8LE-NEXT: addze r8, r8 ; P8LE-NEXT: rldicl r4, r4, 32, 48 ; P8LE-NEXT: srawi r10, r10, 5 ; P8LE-NEXT: slwi r8, r8, 6 -; P8LE-NEXT: subf r7, r8, r7 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: mtvsrd f0, r7 -; P8LE-NEXT: add r3, r3, r6 -; P8LE-NEXT: addze r6, r10 -; P8LE-NEXT: srwi r10, r3, 31 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: slwi r6, r6, 5 -; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: add r3, r3, r10 -; P8LE-NEXT: extsh r10, r4 -; P8LE-NEXT: subf r6, r6, r9 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: srawi r8, r10, 3 +; P8LE-NEXT: addze r10, r10 +; P8LE-NEXT: subf r6, r8, r6 +; P8LE-NEXT: mulhdu r3, r3, r9 +; P8LE-NEXT: extsh r9, r4 +; P8LE-NEXT: slwi r8, r10, 5 +; P8LE-NEXT: mtvsrd f0, r6 +; P8LE-NEXT: srawi r9, r9, 3 +; P8LE-NEXT: subf r6, r8, r7 +; P8LE-NEXT: addze r9, r9 +; P8LE-NEXT: srawi r5, r5, 31 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: addze r7, r8 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: slwi r7, r9, 3 +; 
P8LE-NEXT: andi. r5, r5, 94 +; P8LE-NEXT: subf r4, r7, r4 +; P8LE-NEXT: subf r3, r5, r3 ; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: slwi r5, r7, 3 -; P8LE-NEXT: subf r4, r5, r4 -; P8LE-NEXT: mtvsrd f2, r3 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: xxswapd v5, vs3 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_power_of_two: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 32, 48 +; P8BE-NEXT: li r9, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: rldicl r6, r4, 16, 48 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: clrldi r4, r4, 48 +; P8BE-NEXT: ori r3, r3, 7589 ; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 -; P8BE-NEXT: rldicl r7, r4, 16, 48 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: srawi r8, r6, 5 ; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: addze r8, r8 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: srawi r9, r7, 6 ; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: srawi r8, r5, 5 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: addze r8, r8 +; P8BE-NEXT: mulld r3, r4, r3 ; P8BE-NEXT: slwi r8, r8, 5 -; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: srawi r8, r6, 6 +; P8BE-NEXT: addze r8, r8 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: slwi r8, r8, 6 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: mulhdu r3, r3, r9 +; P8BE-NEXT: srawi r9, r7, 3 ; P8BE-NEXT: subf r6, r8, r6 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: slwi r8, r9, 6 -; P8BE-NEXT: add r3, r3, r5 -; P8BE-NEXT: subf r7, r8, r7 -; P8BE-NEXT: srwi r10, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; 
P8BE-NEXT: add r3, r3, r10 -; P8BE-NEXT: srawi r9, r4, 3 -; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: srawi r4, r4, 31 +; P8BE-NEXT: slwi r8, r9, 3 +; P8BE-NEXT: andi. r4, r4, 94 +; P8BE-NEXT: subf r5, r8, r7 ; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: slwi r6, r8, 3 -; P8BE-NEXT: subf r4, r6, r4 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: subf r3, r4, r3 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v3, r6 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v4, r5 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghh v3, v4, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, @@ -1063,111 +1055,119 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_srem_one: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: lis r5, 2849 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r5, r5, 25644 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -14230 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: xxlxor v4, v4, v4 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 9 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, -19946 -; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: oris r5, r5, 34192 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: ori r5, r5, 45591 +; P9LE-NEXT: andi. 
r4, r4, 22 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 23 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: lis r5, 12 +; P9LE-NEXT: ori r5, r5, 5559 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: oris r5, r5, 1244 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 48291 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 17097 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 24749 -; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 5423 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 5422 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 100 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r5, r5, 13628 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 18438 +; P9LE-NEXT: ori r5, r5, 17236 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 47143 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 654 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. 
r4, r4, 653 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v3, v4 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: vmrglh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr ; ; P9BE-LABEL: dont_fold_srem_one: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: lis r5, 12 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 5559 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: oris r5, r5, 1244 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: ori r5, r5, 48291 +; P9BE-NEXT: andi. r4, r4, 5422 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 5423 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: lis r5, 2849 +; P9BE-NEXT: ori r5, r5, 25644 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: oris r5, r5, 34192 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r5, r5, 45591 ; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 23 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 22 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -14230 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 100 +; P9BE-NEXT: ori r5, r5, 13628 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 18438 +; P9BE-NEXT: ori r5, r5, 17236 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 30865 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 9 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 654 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 653 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1176,107 +1176,115 @@ ; P8LE-LABEL: dont_fold_srem_one: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 24749 -; P8LE-NEXT: lis r8, -19946 -; P8LE-NEXT: lis r10, -14230 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: lis r5, 100 +; P8LE-NEXT: li r9, 23 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: ori r3, r3, 47143 -; P8LE-NEXT: ori r8, r8, 17097 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: extsh r7, r5 -; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: ori r5, r5, 13628 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r4, r4, 1244 +; P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: oris r5, r5, 18438 +; 
P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r7, r6, 32, 48 +; P8LE-NEXT: rldicl r8, r6, 16, 48 +; P8LE-NEXT: ori r5, r5, 17236 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: rldicl r6, r6, 48, 48 +; P8LE-NEXT: extsh r8, r8 ; P8LE-NEXT: extsw r7, r7 -; P8LE-NEXT: extsh r11, r4 -; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: extsw r8, r8 ; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: ori r7, r10, 30865 -; P8LE-NEXT: extsw r10, r11 -; P8LE-NEXT: mulld r8, r9, r8 -; P8LE-NEXT: mulld r7, r10, r7 -; P8LE-NEXT: rldicl r11, r3, 1, 63 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: rldicl r8, r8, 32, 32 -; P8LE-NEXT: rldicl r7, r7, 32, 32 -; P8LE-NEXT: add r8, r8, r9 -; P8LE-NEXT: srawi r3, r3, 11 -; P8LE-NEXT: add r7, r7, r10 -; P8LE-NEXT: srwi r9, r8, 31 -; P8LE-NEXT: srawi r8, r8, 4 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: add r8, r8, r9 -; P8LE-NEXT: srwi r9, r7, 31 -; P8LE-NEXT: srawi r7, r7, 9 -; P8LE-NEXT: mulli r3, r3, 5423 -; P8LE-NEXT: add r7, r7, r9 -; P8LE-NEXT: mulli r8, r8, 23 -; P8LE-NEXT: mulli r7, r7, 654 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r8, r6 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: mulld r5, r6, r5 +; P8LE-NEXT: srawi r7, r7, 31 +; P8LE-NEXT: srawi r8, r8, 31 +; P8LE-NEXT: andi. r7, r7, 22 +; P8LE-NEXT: srawi r6, r6, 31 +; P8LE-NEXT: mulhdu r3, r3, r9 +; P8LE-NEXT: li r9, 5423 +; P8LE-NEXT: andi. r6, r6, 653 +; P8LE-NEXT: mulhdu r4, r4, r9 +; P8LE-NEXT: li r9, 654 +; P8LE-NEXT: mulhdu r5, r5, r9 +; P8LE-NEXT: subf r3, r7, r3 +; P8LE-NEXT: andi. 
r7, r8, 5422 ; P8LE-NEXT: subf r4, r7, r4 -; P8LE-NEXT: mtvsrd f1, r3 -; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r6, r5 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f2, r3 ; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v4, v5 ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_srem_one: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: lis r8, -14230 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: ori r8, r8, 30865 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: lis r3, 12 +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r4, 2849 +; P8BE-NEXT: lis r5, 100 +; P8BE-NEXT: li r9, 5423 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r4, r4, 25644 +; P8BE-NEXT: ori r5, r5, 13628 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r7, r6, 48 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: rldicl r8, r6, 48, 48 +; P8BE-NEXT: oris r3, r3, 1244 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: rldicl r6, r6, 32, 48 +; P8BE-NEXT: oris r4, r4, 34192 +; P8BE-NEXT: ori r3, r3, 48291 +; P8BE-NEXT: extsh r8, r8 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: oris r5, r5, 18438 +; P8BE-NEXT: ori r4, r4, 45591 ; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsw r8, r8 +; P8BE-NEXT: mulld r3, r7, r3 +; P8BE-NEXT: ori r5, r5, 17236 ; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: extsw r4, r4 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: mulld r7, r6, r7 -; P8BE-NEXT: mulld r8, r4, r8 -; P8BE-NEXT: rldicl r9, r3, 1, 63 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rldicl r7, r7, 32, 32 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: srawi r3, r3, 11 -; 
P8BE-NEXT: add r7, r7, r6 -; P8BE-NEXT: add r8, r8, r4 -; P8BE-NEXT: add r3, r3, r9 -; P8BE-NEXT: srwi r9, r7, 31 -; P8BE-NEXT: srawi r7, r7, 4 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: add r7, r7, r9 -; P8BE-NEXT: srwi r9, r8, 31 -; P8BE-NEXT: srawi r8, r8, 9 -; P8BE-NEXT: mulli r7, r7, 23 -; P8BE-NEXT: add r8, r8, r9 +; P8BE-NEXT: mulld r4, r8, r4 +; P8BE-NEXT: mulld r5, r6, r5 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: srawi r8, r8, 31 +; P8BE-NEXT: andi. r7, r7, 5422 +; P8BE-NEXT: srawi r6, r6, 31 +; P8BE-NEXT: mulhdu r3, r3, r9 +; P8BE-NEXT: li r9, 23 +; P8BE-NEXT: andi. r6, r6, 653 +; P8BE-NEXT: mulhdu r4, r4, r9 +; P8BE-NEXT: li r9, 654 +; P8BE-NEXT: mulhdu r5, r5, r9 ; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: mulli r8, r8, 654 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: sldi r5, r9, 48 +; P8BE-NEXT: sldi r9, r9, 48 +; P8BE-NEXT: subf r3, r7, r3 +; P8BE-NEXT: andi. r7, r8, 22 +; P8BE-NEXT: mtvsrd v2, r9 +; P8BE-NEXT: subf r4, r7, r4 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: subf r5, r7, r6 +; P8BE-NEXT: subf r5, r6, r5 +; P8BE-NEXT: sldi r4, r4, 48 ; P8BE-NEXT: mtvsrd v3, r3 ; P8BE-NEXT: sldi r3, r5, 48 -; P8BE-NEXT: subf r4, r8, r4 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v3, v4, v3 ; P8BE-NEXT: vmrghh v2, v2, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 @@ -1289,33 +1297,36 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_i16_smax: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r5, 2849 ; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r5, r5, 25644 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -19946 -; P9LE-NEXT: ori r5, r5, 17097 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 
24749 -; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: oris r5, r5, 34192 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: ori r5, r5, 45591 +; P9LE-NEXT: andi. r4, r4, 22 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 23 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: lis r5, 12 +; P9LE-NEXT: ori r5, r5, 5559 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: oris r5, r5, 1244 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 48291 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 47143 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 5423 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 5422 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 @@ -1337,34 +1348,37 @@ ; ; P9BE-LABEL: dont_fold_urem_i16_smax: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: lis r5, 12 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 5559 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: oris r5, r5, 1244 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: ori r5, r5, 48291 +; P9BE-NEXT: andi. 
r4, r4, 5422 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 5423 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: lis r5, 2849 +; P9BE-NEXT: ori r5, r5, 25644 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: oris r5, r5, 34192 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r5, r5, 45591 ; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 23 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 22 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 @@ -1379,7 +1393,7 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1388,95 +1402,101 @@ ; P8LE-LABEL: dont_fold_urem_i16_smax: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r6, 24749 -; P8LE-NEXT: lis r7, -19946 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: li r8, 23 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: ori r6, r6, 47143 -; P8LE-NEXT: ori r7, r7, 17097 -; P8LE-NEXT: mfvsrd r3, f0 -; P8LE-NEXT: rldicl r4, r3, 16, 48 -; P8LE-NEXT: rldicl r5, r3, 32, 48 -; P8LE-NEXT: extsh r8, r4 -; P8LE-NEXT: extsh r9, r5 -; P8LE-NEXT: extsw r8, r8 -; P8LE-NEXT: extsw r9, r9 -; P8LE-NEXT: mulld r6, r8, r6 -; P8LE-NEXT: mulld r7, r9, r7 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: rldicl r8, r6, 32, 32 -; P8LE-NEXT: rldicl r7, r7, 32, 32 -; P8LE-NEXT: rldicl r6, r6, 1, 63 -; P8LE-NEXT: srawi r8, r8, 11 -; 
P8LE-NEXT: add r7, r7, r9 -; P8LE-NEXT: add r6, r8, r6 -; P8LE-NEXT: srwi r8, r7, 31 -; P8LE-NEXT: srawi r7, r7, 4 -; P8LE-NEXT: mulli r6, r6, 5423 -; P8LE-NEXT: add r7, r7, r8 -; P8LE-NEXT: extsh r8, r3 -; P8LE-NEXT: mulli r7, r7, 23 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: oris r4, r4, 1244 +; P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r6, r5, 32, 48 +; P8LE-NEXT: rldicl r7, r5, 16, 48 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: mulld r4, r7, r4 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: srawi r6, r6, 31 +; P8LE-NEXT: srawi r7, r7, 31 +; P8LE-NEXT: andi. r6, r6, 22 +; P8LE-NEXT: mulhdu r3, r3, r8 +; P8LE-NEXT: li r8, 5423 +; P8LE-NEXT: mulhdu r4, r4, r8 +; P8LE-NEXT: extsh r8, r5 ; P8LE-NEXT: srawi r8, r8, 15 +; P8LE-NEXT: subf r3, r6, r3 +; P8LE-NEXT: andi. 
r6, r7, 5422 +; P8LE-NEXT: addze r7, r8 ; P8LE-NEXT: subf r4, r6, r4 -; P8LE-NEXT: addze r6, r8 -; P8LE-NEXT: mtvsrd f0, r4 -; P8LE-NEXT: slwi r4, r6, 15 -; P8LE-NEXT: subf r5, r7, r5 -; P8LE-NEXT: subf r3, r4, r3 -; P8LE-NEXT: mtvsrd f1, r5 -; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: slwi r6, r7, 15 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r6, r5 +; P8LE-NEXT: mtvsrd f1, r4 ; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v4, v5 ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_i16_smax: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: lis r3, 12 +; P8BE-NEXT: mfvsrd r5, v2 +; P8BE-NEXT: lis r4, 2849 +; P8BE-NEXT: li r8, 5423 +; P8BE-NEXT: li r9, 23 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r4, r4, 25644 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r6, r5, 48 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: oris r3, r3, 1244 ; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: rldicl r7, r5, 48, 48 +; P8BE-NEXT: oris r4, r4, 34192 +; P8BE-NEXT: ori r3, r3, 48291 ; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: mulld r7, r6, r7 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: rldicl r8, r3, 1, 63 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rldicl r7, r7, 32, 32 -; P8BE-NEXT: srawi r3, r3, 11 -; P8BE-NEXT: add r7, r7, r6 -; P8BE-NEXT: add r3, r3, r8 -; P8BE-NEXT: srwi r8, r7, 31 -; P8BE-NEXT: srawi r7, r7, 4 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: add r7, r7, r8 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: ori r4, r4, 45591 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: mulld r3, 
r6, r3 +; P8BE-NEXT: mulld r4, r7, r4 +; P8BE-NEXT: rldicl r5, r5, 32, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: mulhdu r3, r3, r8 ; P8BE-NEXT: li r8, 0 -; P8BE-NEXT: mulli r7, r7, 23 -; P8BE-NEXT: srawi r9, r4, 15 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: sldi r5, r8, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: subf r5, r7, r6 -; P8BE-NEXT: slwi r6, r8, 15 -; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mulhdu r4, r4, r9 +; P8BE-NEXT: srawi r9, r5, 15 +; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: srawi r6, r6, 31 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: andi. r6, r6, 5422 +; P8BE-NEXT: slwi r9, r9, 15 +; P8BE-NEXT: subf r3, r6, r3 +; P8BE-NEXT: andi. r6, r7, 22 +; P8BE-NEXT: subf r5, r9, r5 ; P8BE-NEXT: subf r4, r6, r4 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: sldi r8, r8, 48 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v2, r8 ; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: mtvsrd v4, r3 ; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 +; P8BE-NEXT: vmrghh v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, Index: llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -11,113 +11,116 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 +; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r5, 21399 -; P9LE-NEXT: ori r5, r5, 33437 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: lis r5, 16727 -; P9LE-NEXT: ori r5, r5, 2287 -; P9LE-NEXT: rldicl r4, r4, 27, 37 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: subf r3, 
r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 95 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 528 +; P9LE-NEXT: ori r4, r4, 33825 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: oris r4, r4, 2114 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: lis r5, 8456 -; P9LE-NEXT: ori r5, r5, 16913 -; P9LE-NEXT: rldicl r4, r4, 24, 40 -; P9LE-NEXT: mulli r4, r4, 1003 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 4229 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 124 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 668 +; P9LE-NEXT: ori r4, r4, 48148 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: oris r4, r4, 58848 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 30, 34 -; P9LE-NEXT: mulli r4, r4, 124 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 42800 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 98 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 65 +; P9LE-NEXT: ori r4, r4, 22280 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 61158 +; P9LE-NEXT: ori r4, r4, 14506 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 1003 +; P9LE-NEXT: mulhdu r3, r3, r4 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 -; 
P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 65 +; P9BE-NEXT: ori r4, r4, 22280 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 16727 -; P9BE-NEXT: ori r5, r5, 2287 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: lis r5, 21399 -; P9BE-NEXT: ori r5, r5, 33437 -; P9BE-NEXT: rldicl r4, r4, 24, 40 -; P9BE-NEXT: mulli r4, r4, 1003 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: oris r4, r4, 61158 +; P9BE-NEXT: ori r4, r4, 14506 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 1003 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 668 +; P9BE-NEXT: ori r4, r4, 48148 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 58848 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: ori r4, r4, 42800 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: lis r5, 8456 -; P9BE-NEXT: ori r5, r5, 16913 -; P9BE-NEXT: rldicl r4, r4, 27, 37 -; P9BE-NEXT: mulli r4, r4, 98 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 98 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 528 +; P9BE-NEXT: ori r4, r4, 33825 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 2114 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: ori r4, r4, 4229 ; P9BE-NEXT: vextuhlx 
r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9BE-NEXT: mulld r3, r3, r5 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: rldicl r3, r3, 30, 34 -; P9BE-NEXT: mulli r3, r3, 124 -; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 124 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: oris r4, r4, 4139 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r4, r4, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 95 +; P9BE-NEXT: mulhdu r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 @@ -127,110 +130,113 @@ ; P8LE-LABEL: fold_urem_vec_1: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: lis r8, 21399 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: ori r8, r8, 33437 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r9, r4, 32, 48 -; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 -; P8LE-NEXT: rldicl r10, r4, 16, 48 -; P8LE-NEXT: rlwinm r11, r9, 0, 16, 31 -; P8LE-NEXT: clrldi r7, r6, 32 -; P8LE-NEXT: rlwinm r12, r10, 0, 16, 31 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: lis r4, 528 +; P8LE-NEXT: lis r5, 668 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: ori r4, r4, 33825 +; P8LE-NEXT: ori r5, r5, 48148 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: oris 
r4, r4, 2114 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: ori r4, r4, 4229 +; P8LE-NEXT: clrldi r7, r6, 48 +; P8LE-NEXT: rldicl r8, r6, 48, 48 +; P8LE-NEXT: oris r5, r5, 58848 +; P8LE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8LE-NEXT: ori r5, r5, 42800 ; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: lis r7, 16727 -; P8LE-NEXT: ori r7, r7, 2287 -; P8LE-NEXT: mulld r8, r11, r8 -; P8LE-NEXT: lis r11, 8456 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r7, r12, r7 -; P8LE-NEXT: ori r11, r11, 16913 -; P8LE-NEXT: rlwinm r12, r4, 30, 18, 31 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: mulld r11, r12, r11 -; P8LE-NEXT: subf r6, r3, r6 -; P8LE-NEXT: rldicl r8, r8, 27, 37 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: rldicl r6, r7, 24, 40 -; P8LE-NEXT: mulli r7, r8, 98 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: rldicl r8, r11, 30, 34 -; P8LE-NEXT: mulli r6, r6, 1003 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: mulli r8, r8, 124 -; P8LE-NEXT: subf r7, r7, r9 -; P8LE-NEXT: subf r6, r6, r10 -; P8LE-NEXT: mtvsrd f0, r7 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: subf r4, r8, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: lis r7, 65 +; P8LE-NEXT: ori r7, r7, 22280 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: rldicl r8, r6, 32, 48 +; P8LE-NEXT: sldi r7, r7, 32 +; P8LE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8LE-NEXT: oris r7, r7, 61158 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: mulld r5, r8, r5 +; P8LE-NEXT: ori r7, r7, 14506 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: mulld r6, r6, r7 +; P8LE-NEXT: li r7, 95 +; P8LE-NEXT: mulhdu r3, r3, r7 +; P8LE-NEXT: li r7, 124 +; P8LE-NEXT: mulhdu r4, r4, r7 +; P8LE-NEXT: li r7, 98 +; P8LE-NEXT: mulhdu r5, r5, r7 +; P8LE-NEXT: li r7, 1003 +; P8LE-NEXT: mulhdu r6, r6, r7 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 
+; P8LE-NEXT: mtvsrd f3, r6 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: lis r9, 16727 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: ori r9, r9, 2287 -; P8BE-NEXT: rldicl r5, r4, 16, 48 -; P8BE-NEXT: clrldi r6, r4, 48 -; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 -; P8BE-NEXT: rldicl r7, r4, 48, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 -; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: lis r3, 65 +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r4, 668 +; P8BE-NEXT: lis r5, 528 +; P8BE-NEXT: ori r3, r3, 22280 +; P8BE-NEXT: ori r4, r4, 48148 +; P8BE-NEXT: ori r5, r5, 33825 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r7, r6, 48 +; P8BE-NEXT: oris r3, r3, 61158 +; P8BE-NEXT: sldi r4, r4, 32 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: mulld r3, r8, r3 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: clrldi r10, r6, 32 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: clrldi r11, r7, 32 -; P8BE-NEXT: mulld r9, r10, r9 -; P8BE-NEXT: lis r10, 8456 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: mulld r8, r11, r8 -; P8BE-NEXT: ori r10, r10, 16913 -; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: mulld r10, r11, r10 -; P8BE-NEXT: subf r11, r3, r5 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: rldicl r9, r9, 24, 40 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: rldicl r8, r8, 27, 37 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r9, r9, 1003 -; P8BE-NEXT: rldicl r10, r10, 30, 34 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: mulli r10, r10, 124 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: subf r7, r8, r7 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: subf 
r4, r10, r4 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: ori r3, r3, 14506 +; P8BE-NEXT: rldicl r8, r6, 48, 48 +; P8BE-NEXT: oris r4, r4, 58848 +; P8BE-NEXT: mulld r3, r7, r3 +; P8BE-NEXT: lis r7, 689 +; P8BE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8BE-NEXT: ori r4, r4, 42800 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: ori r7, r7, 55878 +; P8BE-NEXT: mulld r4, r8, r4 +; P8BE-NEXT: rldicl r8, r6, 32, 48 +; P8BE-NEXT: oris r5, r5, 2114 +; P8BE-NEXT: sldi r7, r7, 32 +; P8BE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8BE-NEXT: ori r5, r5, 4229 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: oris r7, r7, 4139 +; P8BE-NEXT: mulld r5, r8, r5 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: ori r7, r7, 7589 +; P8BE-NEXT: mulld r6, r6, r7 +; P8BE-NEXT: li r7, 1003 +; P8BE-NEXT: mulhdu r3, r3, r7 +; P8BE-NEXT: li r7, 98 +; P8BE-NEXT: mulhdu r4, r4, r7 +; P8BE-NEXT: li r7, 124 +; P8BE-NEXT: mulhdu r5, r5, r7 +; P8BE-NEXT: li r7, 95 +; P8BE-NEXT: mulhdu r6, r6, r7 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: mtvsrd v2, r3 +; P8BE-NEXT: sldi r3, r4, 48 +; P8BE-NEXT: sldi r4, r5, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v4, v5 +; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, @@ -240,61 +246,37 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; 
P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 @@ -305,63 +287,39 
@@ ; ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 4139 +; P9BE-NEXT: ori r4, r4, 7589 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, 
r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 @@ -371,62 +329,34 @@ ; P8LE-LABEL: fold_urem_vec_2: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 22765 -; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 8969 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: clrldi r3, r5, 48 -; P8LE-NEXT: rldicl r6, r5, 48, 48 -; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 -; P8LE-NEXT: rldicl r7, r5, 32, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r5, r5, 16, 48 -; P8LE-NEXT: clrldi r11, r8, 32 -; P8LE-NEXT: rlwinm r10, r7, 0, 16, 31 -; P8LE-NEXT: rlwinm r12, r5, 0, 16, 31 -; P8LE-NEXT: mulld r11, r11, r4 -; P8LE-NEXT: clrldi r0, r9, 32 -; P8LE-NEXT: clrldi r30, r10, 32 -; P8LE-NEXT: clrldi r29, r12, 32 -; P8LE-NEXT: mulld r0, r0, r4 -; P8LE-NEXT: mulld r30, r30, r4 -; P8LE-NEXT: mulld r4, r29, r4 -; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload -; P8LE-NEXT: rldicl r11, r11, 32, 32 -; P8LE-NEXT: subf r8, r11, r8 -; P8LE-NEXT: rldicl r0, r0, 32, 32 -; P8LE-NEXT: srwi r8, r8, 1 -; P8LE-NEXT: rldicl r30, r30, 32, 32 -; P8LE-NEXT: rldicl r4, r4, 32, 32 -; P8LE-NEXT: subf r9, r0, r9 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: subf r10, r30, r10 -; P8LE-NEXT: subf r11, r4, r12 -; P8LE-NEXT: srwi r9, r9, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: srwi r10, r10, 1 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: add r9, r9, r0 -; P8LE-NEXT: add r10, r10, r30 -; P8LE-NEXT: add r4, r11, r4 -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: srwi r4, r4, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; 
P8LE-NEXT: mulli r4, r4, 95 -; P8LE-NEXT: subf r3, r8, r3 -; P8LE-NEXT: subf r6, r9, r6 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 -; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8LE-NEXT: mulld r5, r5, r3 +; P8LE-NEXT: mulld r6, r6, r3 +; P8LE-NEXT: mulld r7, r7, r3 +; P8LE-NEXT: mulld r3, r4, r3 +; P8LE-NEXT: li r4, 95 +; P8LE-NEXT: mulhdu r5, r5, r4 +; P8LE-NEXT: mulhdu r6, r6, r4 +; P8LE-NEXT: mulhdu r7, r7, r4 +; P8LE-NEXT: mulhdu r3, r3, r4 +; P8LE-NEXT: mtvsrd f0, r5 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f2, r7 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 @@ -437,59 +367,35 @@ ; ; P8BE-LABEL: fold_urem_vec_2: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: rldicl r6, r4, 48, 48 ; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 ; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 -; P8BE-NEXT: clrldi r8, r5, 32 ; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: mulld r5, r5, r3 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: clrldi r9, r6, 32 -; P8BE-NEXT: mulld r8, r8, r3 ; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: clrldi r10, 
r7, 32 -; P8BE-NEXT: mulld r9, r9, r3 -; P8BE-NEXT: clrldi r11, r4, 32 -; P8BE-NEXT: mulld r10, r10, r3 -; P8BE-NEXT: mulld r3, r11, r3 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: subf r11, r8, r5 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: subf r12, r9, r6 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: subf r11, r10, r7 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: subf r12, r3, r4 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: subf r5, r8, r5 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: subf r7, r10, r7 -; P8BE-NEXT: subf r3, r3, r4 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: mulld r6, r6, r3 +; P8BE-NEXT: mulld r7, r7, r3 +; P8BE-NEXT: mulld r3, r4, r3 +; P8BE-NEXT: li r4, 95 +; P8BE-NEXT: mulhdu r5, r5, r4 +; P8BE-NEXT: mulhdu r6, r6, r4 +; P8BE-NEXT: mulhdu r7, r7, r4 +; P8BE-NEXT: mulhdu r3, r3, r4 +; P8BE-NEXT: sldi r4, r5, 48 +; P8BE-NEXT: sldi r5, r6, 48 +; P8BE-NEXT: mtvsrd v2, r4 ; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: mtvsrd v4, r4 ; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 @@ -830,30 +736,27 @@ ; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 ; 
P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 95 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; @@ -869,100 +772,91 @@ ; P9BE-NEXT: rlwinm r3, r3, 0, 26, 31 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 ; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 4139 +; P9BE-NEXT: ori r4, r4, 7589 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, 
r3, r4 +; P9BE-NEXT: li r4, 95 +; P9BE-NEXT: mulhdu r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v3, v2 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_power_of_two: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r6, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 ; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 ; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 -; P8LE-NEXT: clrldi r7, r6, 32 -; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: rlwinm r7, r7, 0, 27, 31 -; P8LE-NEXT: mtvsrd f1, r7 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: subf r6, r3, r6 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: mulld r3, r5, r3 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rlwinm r5, r5, 0, 26, 31 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: mulhdu r3, r3, r6 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: rlwinm r6, r6, 0, 26, 31 -; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: rlwinm r5, r6, 0, 27, 31 ; P8LE-NEXT: rlwinm r4, r4, 0, 29, 31 -; P8LE-NEXT: mtvsrd f0, r6 -; P8LE-NEXT: mtvsrd f3, r4 -; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f3, r3 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: mtvsrd f2, r3 ; P8LE-NEXT: vmrglh v2, v3, v2 -; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: 
dont_fold_urem_power_of_two: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: li r6, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 -; P8BE-NEXT: rlwinm r7, r7, 0, 26, 31 -; P8BE-NEXT: clrldi r6, r5, 32 -; P8BE-NEXT: mulld r3, r6, r3 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: subf r6, r3, r5 -; P8BE-NEXT: srwi r6, r6, 1 -; P8BE-NEXT: add r3, r6, r3 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 27, 31 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: mulhdu r3, r3, r6 +; P8BE-NEXT: rldicl r6, r4, 16, 48 ; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 27, 31 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 26, 31 ; P8BE-NEXT: rlwinm r4, r4, 0, 29, 31 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: sldi r5, r6, 48 ; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v4, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, @@ -973,37 +867,42 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_one: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 2849 +; P9LE-NEXT: ori r4, r4, 25644 ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: li r5, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: oris r6, r5, 45590 -; 
P9LE-NEXT: oris r5, r5, 51306 -; P9LE-NEXT: ori r6, r6, 17097 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r6 -; P9LE-NEXT: lis r6, 24749 -; P9LE-NEXT: ori r6, r6, 47143 -; P9LE-NEXT: rldicl r4, r4, 28, 36 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: oris r4, r4, 34192 +; P9LE-NEXT: ori r4, r4, 45591 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 23 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 12 +; P9LE-NEXT: ori r4, r4, 5559 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: oris r4, r4, 1244 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r6 -; P9LE-NEXT: rldicl r4, r4, 21, 43 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 48291 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 5423 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 100 +; P9LE-NEXT: ori r4, r4, 13628 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: oris r4, r4, 18438 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 24, 40 -; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 17236 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 654 +; P9LE-NEXT: mulhdu r3, r3, r4 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 @@ -1015,47 +914,49 @@ ; ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 12 +; P9BE-NEXT: ori r4, r4, 5559 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 24749 
-; P9BE-NEXT: ori r5, r5, 47143 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: li r5, 0 -; P9BE-NEXT: oris r6, r5, 45590 -; P9BE-NEXT: oris r5, r5, 51306 -; P9BE-NEXT: ori r6, r6, 17097 -; P9BE-NEXT: ori r5, r5, 30865 -; P9BE-NEXT: rldicl r4, r4, 21, 43 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: oris r4, r4, 1244 +; P9BE-NEXT: ori r4, r4, 48291 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 5423 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 2849 +; P9BE-NEXT: ori r4, r4, 25644 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 34192 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: ori r4, r4, 45591 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r6 -; P9BE-NEXT: rldicl r4, r4, 28, 36 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 23 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 100 +; P9BE-NEXT: ori r4, r4, 13628 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 18438 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9BE-NEXT: mulld r3, r3, r5 -; P9BE-NEXT: rldicl r3, r3, 24, 40 -; P9BE-NEXT: mulli r3, r3, 654 -; P9BE-NEXT: subf r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r4, r4, 17236 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 654 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 ; 
P9BE-NEXT: blr @@ -1063,37 +964,42 @@ ; P8LE-LABEL: dont_fold_urem_one: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: li r3, 0 -; P8LE-NEXT: lis r8, 24749 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: lis r7, 100 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: oris r5, r3, 45590 -; P8LE-NEXT: ori r8, r8, 47143 -; P8LE-NEXT: oris r3, r3, 51306 -; P8LE-NEXT: ori r5, r5, 17097 -; P8LE-NEXT: ori r3, r3, 30865 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r7, r4, 16, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r5, r9, r5 -; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31 -; P8LE-NEXT: mulld r8, r9, r8 -; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31 -; P8LE-NEXT: mulld r3, r9, r3 -; P8LE-NEXT: rldicl r5, r5, 28, 36 -; P8LE-NEXT: rldicl r8, r8, 21, 43 -; P8LE-NEXT: mulli r5, r5, 23 -; P8LE-NEXT: rldicl r3, r3, 24, 40 -; P8LE-NEXT: mulli r8, r8, 5423 -; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: subf r5, r5, r6 -; P8LE-NEXT: subf r6, r8, r7 -; P8LE-NEXT: mtvsrd f0, r5 -; P8LE-NEXT: subf r3, r3, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: ori r7, r7, 13628 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: oris r4, r4, 1244 +; P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r6, r5, 32, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: rldicl r6, r5, 16, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: mulld r4, r6, r4 +; P8LE-NEXT: sldi r6, r7, 32 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: oris r6, r6, 18438 +; P8LE-NEXT: ori r6, r6, 17236 +; P8LE-NEXT: mulld r5, r5, r6 +; P8LE-NEXT: li r6, 23 +; P8LE-NEXT: mulhdu r3, r3, r6 +; P8LE-NEXT: li r6, 5423 +; P8LE-NEXT: mulhdu r4, 
r4, r6 +; P8LE-NEXT: li r6, 654 +; P8LE-NEXT: mulhdu r5, r5, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: mtvsrd f1, r4 ; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: vmrglh v2, v3, v2 @@ -1103,46 +1009,48 @@ ; ; P8BE-LABEL: dont_fold_urem_one: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 12 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: li r3, 0 -; P8BE-NEXT: lis r8, 24749 -; P8BE-NEXT: oris r6, r3, 51306 -; P8BE-NEXT: ori r8, r8, 47143 -; P8BE-NEXT: oris r3, r3, 45590 -; P8BE-NEXT: rldicl r5, r4, 32, 48 -; P8BE-NEXT: clrldi r7, r4, 48 -; P8BE-NEXT: ori r6, r6, 30865 -; P8BE-NEXT: ori r3, r3, 17097 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: rlwinm r9, r5, 31, 17, 31 +; P8BE-NEXT: lis r5, 2849 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r5, r5, 25644 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r6, r4, 48 +; P8BE-NEXT: oris r3, r3, 1244 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: ori r3, r3, 48291 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mulld r3, r6, r3 +; P8BE-NEXT: lis r6, 100 +; P8BE-NEXT: oris r5, r5, 34192 +; P8BE-NEXT: ori r6, r6, 13628 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: ori r5, r5, 45591 +; P8BE-NEXT: sldi r6, r6, 32 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: oris r6, r6, 18438 +; P8BE-NEXT: mulld r5, r7, r5 ; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: mulld r6, r9, r6 -; P8BE-NEXT: clrldi r9, r7, 32 -; P8BE-NEXT: mulld r8, r9, r8 -; P8BE-NEXT: clrldi r9, r4, 32 -; P8BE-NEXT: mulld r3, r9, r3 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: rldicl r6, r6, 24, 40 -; P8BE-NEXT: mulli r6, r6, 654 -; P8BE-NEXT: rldicl r8, r8, 21, 43 -; P8BE-NEXT: rldicl r3, r3, 28, 36 -; P8BE-NEXT: mulli r8, r8, 5423 -; P8BE-NEXT: mulli r3, r3, 23 -; P8BE-NEXT: subf r5, r6, r5 -; P8BE-NEXT: sldi r6, r9, 48 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: subf r6, 
r8, r7 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: subf r3, r3, r4 -; P8BE-NEXT: sldi r4, r6, 48 +; P8BE-NEXT: ori r6, r6, 17236 +; P8BE-NEXT: mulld r4, r4, r6 +; P8BE-NEXT: li r6, 5423 +; P8BE-NEXT: mulhdu r3, r3, r6 +; P8BE-NEXT: li r6, 23 +; P8BE-NEXT: mulhdu r5, r5, r6 +; P8BE-NEXT: li r6, 654 +; P8BE-NEXT: mulhdu r4, r4, r6 +; P8BE-NEXT: li r6, 0 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r3, r4, 48 +; P8BE-NEXT: mtvsrd v4, r5 ; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v2, v2, v3 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, Index: llvm/test/CodeGen/RISCV/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/srem-lkk.ll +++ llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -56,22 +56,18 @@ ; RV64IM-LABEL: fold_srem_positive_odd: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1045903 -; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: lui a1, 176602 +; RV64IM-NEXT: addiw a1, a1, 1121 ; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -905 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1767 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: add a1, a1, a0 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 6 -; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a1, a1, 345 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -603 +; RV64IM-NEXT: mul a1, a0, a1 ; RV64IM-NEXT: addi a2, zero, 95 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 94 +; RV64IM-NEXT: subw a0, a1, a0 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; 
RV64IM-NEXT: ret %1 = srem i32 %x, 95 @@ -126,19 +122,18 @@ ; RV64IM-LABEL: fold_srem_positive_even: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 506482 -; RV64IM-NEXT: addiw a1, a1, -31 +; RV64IM-NEXT: lui a1, 15828 +; RV64IM-NEXT: addiw a1, a1, -1793 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1113 ; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 711 -; RV64IM-NEXT: slli a1, a1, 19 -; RV64IM-NEXT: addi a1, a1, 1979 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 -; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a1, a1, -1020 +; RV64IM-NEXT: mul a1, a0, a1 ; RV64IM-NEXT: addi a2, zero, 1060 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 1059 +; RV64IM-NEXT: subw a0, a1, a0 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = srem i32 %x, 1060 @@ -193,22 +188,18 @@ ; RV64IM-LABEL: fold_srem_negative_odd: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 4781 -; RV64IM-NEXT: addiw a1, a1, 2045 +; RV64IM-NEXT: lui a1, 11603 +; RV64IM-NEXT: addiw a1, a1, -2045 ; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 1371 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1355 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: sub a1, a1, a0 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 -; RV64IM-NEXT: add a1, a1, a2 -; RV64IM-NEXT: addi a2, zero, -723 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: addi a1, a1, -1371 +; RV64IM-NEXT: slli a1, a1, 16 +; RV64IM-NEXT: addi a1, a1, 91 +; RV64IM-NEXT: mul a1, a0, a1 +; RV64IM-NEXT: addi a2, zero, 723 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 722 +; RV64IM-NEXT: subw a0, a1, a0 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 
; RV64IM-NEXT: ret %1 = srem i32 %x, -723 @@ -266,22 +257,20 @@ ; RV64IM-LABEL: fold_srem_negative_even: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1036895 -; RV64IM-NEXT: addiw a1, a1, 999 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -523 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -481 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 12 -; RV64IM-NEXT: add a1, a1, a2 -; RV64IM-NEXT: lui a2, 1048570 -; RV64IM-NEXT: addiw a2, a2, 1595 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: srli a1, a0, 31 +; RV64IM-NEXT: lui a2, 6 +; RV64IM-NEXT: addiw a3, a2, -1596 +; RV64IM-NEXT: and a1, a1, a3 +; RV64IM-NEXT: lui a3, 11681 +; RV64IM-NEXT: addiw a3, a3, -999 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, -11 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, 524 +; RV64IM-NEXT: mul a0, a0, a3 +; RV64IM-NEXT: addiw a2, a2, -1595 +; RV64IM-NEXT: mulhu a0, a0, a2 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = srem i32 %x, -22981 Index: llvm/test/CodeGen/RISCV/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/urem-lkk.ll +++ llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -58,22 +58,15 @@ ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 32 ; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1423 -; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: lui a1, 176602 +; RV64IM-NEXT: addiw a1, a1, 1121 ; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: addi a1, a1, 345 ; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -1811 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 561 -; RV64IM-NEXT: mulhu a1, a0, a1 -; RV64IM-NEXT: sub a2, a0, a1 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: 
srli a1, a1, 6 -; RV64IM-NEXT: addi a2, zero, 95 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: addi a1, a1, -603 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: addi a1, zero, 95 +; RV64IM-NEXT: mulhu a0, a0, a1 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 @@ -128,19 +121,15 @@ ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 32 ; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1048020 +; RV64IM-NEXT: lui a1, 15828 ; RV64IM-NEXT: addiw a1, a1, -1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 139 -; RV64IM-NEXT: slli a1, a1, 14 -; RV64IM-NEXT: addi a1, a1, 1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -139 -; RV64IM-NEXT: mulhu a1, a0, a1 -; RV64IM-NEXT: srli a1, a1, 10 -; RV64IM-NEXT: addi a2, zero, 1060 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1113 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -1020 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: addi a1, zero, 1060 +; RV64IM-NEXT: mulhu a0, a0, a1 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = urem i32 %x, 1060 Index: llvm/test/CodeGen/X86/load-scalar-as-vector.ll =================================================================== --- llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -418,28 +418,28 @@ define <4 x i32> @srem_op1_constant(i32* %p) nounwind { ; SSE-LABEL: srem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movslq (%rdi), %rax -; SSE-NEXT: imulq $818089009, %rax, %rcx # imm = 0x30C30C31 -; SSE-NEXT: movq %rcx, %rdx -; SSE-NEXT: shrq $63, %rdx -; SSE-NEXT: sarq $35, %rcx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movslq (%rdi), %rcx +; SSE-NEXT: movabsq $439208192231179801, %rax # imm = 0x618618618618619 +; SSE-NEXT: imulq %rcx, %rax +; SSE-NEXT: movl 
$42, %edx +; SSE-NEXT: mulq %rdx +; SSE-NEXT: sarl $31, %ecx +; SSE-NEXT: andl $41, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movd %edx, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: srem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movslq (%rdi), %rax -; AVX-NEXT: imulq $818089009, %rax, %rcx # imm = 0x30C30C31 -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shrq $63, %rdx -; AVX-NEXT: sarq $35, %rcx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movslq (%rdi), %rcx +; AVX-NEXT: movabsq $439208192231179801, %rax # imm = 0x618618618618619 +; AVX-NEXT: imulq %rcx, %rax +; AVX-NEXT: movl $42, %edx +; AVX-NEXT: mulq %rdx +; AVX-NEXT: sarl $31, %ecx +; AVX-NEXT: andl $41, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 ; AVX-NEXT: retq %x = load i32, i32* %p %b = srem i32 %x, 42 @@ -520,29 +520,21 @@ define <16 x i8> @urem_op1_constant(i8* %p) nounwind { ; SSE-LABEL: urem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movb (%rdi), %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb %cl -; SSE-NEXT: movzbl %cl, %ecx -; SSE-NEXT: imull $49, %ecx, %ecx -; SSE-NEXT: shrl $10, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subb %cl, %al -; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; SSE-NEXT: movzwl %ax, %eax +; SSE-NEXT: imull $42, %eax, %eax +; SSE-NEXT: shrl $16, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: urem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movb (%rdi), %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrb %cl -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: imull $49, %ecx, %ecx -; AVX-NEXT: shrl $10, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: movzbl (%rdi), %eax +; AVX-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: imull $42, %eax, %eax +; AVX-NEXT: shrl $16, %eax ; 
AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %x = load i8, i8* %p Index: llvm/test/CodeGen/X86/pr14088.ll =================================================================== --- llvm/test/CodeGen/X86/pr14088.ll +++ llvm/test/CodeGen/X86/pr14088.ll @@ -17,23 +17,24 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movslq %r8d, %rax -; CHECK-NEXT: imulq $1374389535, %rax, %rcx # imm = 0x51EB851F -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: shrq $63, %rdi -; CHECK-NEXT: sarq $37, %rcx -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: imull $100, %ecx, %ecx -; CHECK-NEXT: subl %ecx, %eax -; CHECK-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: cwtl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: movslq %r8d, %rdi +; CHECK-NEXT: movabsq $184467440737095517, %rax # imm = 0x28F5C28F5C28F5D +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: movl $100, %edx +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: andl $99, %edi +; CHECK-NEXT: subl %edi, %edx +; CHECK-NEXT: movw %dx, (%rsi) +; CHECK-NEXT: movswl %dx, %eax ; CHECK-NEXT: cltq ; CHECK-NEXT: imulq $1717986919, %rax, %rax # imm = 0x66666667 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $63, %rcx +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: shrq $63, %rdx ; CHECK-NEXT: shrq $34, %rax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movb %al, (%rdx) +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movb %al, (%rcx) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .LBB0_2: # %return ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/srem-lkk.ll +++ llvm/test/CodeGen/X86/srem-lkk.ll @@ -4,15 +4,14 @@ define i32 @fold_srem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_srem_positive_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: 
movl %ecx, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: sarl $6, %ecx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $95, %ecx, %ecx +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %edx +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $94, %ecx ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -24,13 +23,14 @@ define i32 @fold_srem_positive_even(i32 %x) { ; CHECK-LABEL: fold_srem_positive_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $1037275121, %rax, %rcx # imm = 0x3DD38FF1 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $63, %rdx -; CHECK-NEXT: sarq $40, %rcx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $1060, %ecx, %ecx # imm = 0x424 +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %edx # imm = 0x424 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $1059, %ecx # imm = 0x423 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -42,13 +42,14 @@ define i32 @fold_srem_negative_odd(i32 %x) { ; CHECK-LABEL: fold_srem_negative_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $-1520762971, %rax, %rcx # imm = 0xA55AFFA5 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $63, %rdx -; CHECK-NEXT: sarq $40, %rcx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $-723, %ecx, %ecx # imm = 0xFD2D +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $25514168843305051, %rax # imm = 0x5AA5005AA5005B +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $723, %edx # imm = 0x2D3 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: 
sarl $31, %ecx +; CHECK-NEXT: andl $722, %ecx # imm = 0x2D2 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -60,13 +61,14 @@ define i32 @fold_srem_negative_even(i32 %x) { ; CHECK-LABEL: fold_srem_negative_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $-47844377, %rax, %rcx # imm = 0xFD25F3E7 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $63, %rdx -; CHECK-NEXT: sarq $40, %rcx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $-22981, %ecx, %ecx # imm = 0xA63B +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $802695447269900, %rax # imm = 0x2DA0C18FF520C +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $22981, %edx # imm = 0x59C5 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $22980, %ecx # imm = 0x59C4 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -6,104 +6,55 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: subl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: movswl %dx, %esi -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi -; 
SSE-NEXT: addl %edx, %esi -; SSE-NEXT: imull $95, %esi, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $-124, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $98, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [95,124,98,1003] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_1: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: subl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; 
AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: movswl %dx, %esi -; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: imull $95, %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $-124, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $98, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_srem_vec_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_srem_vec_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; 
AVX2-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } @@ -111,27 +62,53 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrlw $15, %xmm2 -; SSE-NEXT: psraw $6, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [95,95,95,95] +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: 
vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_srem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_srem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [95,95,95,95] +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } @@ -197,18 +174,14 @@ ; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $95, %edx, %ecx -; SSE-NEXT: subl %ecx, 
%eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: cwtl +; SSE-NEXT: imull $45210183, %eax, %ecx # imm = 0x2B1DA47 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $94, %eax +; SSE-NEXT: imulq $95, %rcx, %rcx +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -236,18 +209,14 @@ ; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $95, %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: cwtl +; AVX-NEXT: imull $45210183, %eax, %ecx # imm = 0x2B1DA47 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $94, %eax +; AVX-NEXT: imulq $95, %rcx, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -257,83 +226,71 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl 
$23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: cwtl +; SSE-NEXT: imull $6567229, %eax, %ecx # imm = 0x64353D +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $653, %eax # imm = 0x28D +; SSE-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; SSE-NEXT: leaq (%rcx,%rcx,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rcx, %rdx +; SSE-NEXT: shrq $32, %rdx +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $22, %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: pinsrw $2, %edx, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $5422, %eax # imm = 0x152E +; SSE-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx 
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx ; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx -; AVX-NEXT: addl %esi, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $6567229, %eax, %ecx # imm = 0x64353D +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $653, %eax # imm = 0x28D +; AVX-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; AVX-NEXT: leaq (%rcx,%rcx,2), %rdx +; AVX-NEXT: shlq $3, %rdx +; AVX-NEXT: subq %rcx, %rdx +; AVX-NEXT: shrq $32, %rdx +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $22, %eax +; AVX-NEXT: subl %eax, %edx ; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: cwtl +; AVX-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $5422, %eax # imm = 0x152E +; AVX-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -343,58 +300,39 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; SSE-LABEL: 
dont_fold_urem_i16_smax: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax ; SSE-NEXT: leal 32767(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx ; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 ; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; SSE-NEXT: leaq (%rcx,%rcx,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rcx, %rdx +; SSE-NEXT: shrq $32, %rdx +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $22, %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: pinsrw $2, %edx, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $5422, %eax # imm = 0x152E +; SSE-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: 
dont_fold_urem_i16_smax: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx ; AVX-NEXT: vpextrw $1, %xmm0, %eax ; AVX-NEXT: leal 32767(%rax), %ecx ; AVX-NEXT: testw %ax, %ax @@ -403,17 +341,26 @@ ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; AVX-NEXT: leaq (%rcx,%rcx,2), %rdx +; AVX-NEXT: shlq $3, %rdx +; AVX-NEXT: subq %rcx, %rdx +; AVX-NEXT: shrq $32, %rdx +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $22, %eax +; AVX-NEXT: subl %eax, %edx ; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: cwtl +; AVX-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $5422, %eax # imm = 0x152E +; AVX-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 Index: llvm/test/CodeGen/X86/urem-i8-constant.ll =================================================================== --- llvm/test/CodeGen/X86/urem-i8-constant.ll +++ 
llvm/test/CodeGen/X86/urem-i8-constant.ll @@ -7,11 +7,11 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: imull $111, %eax, %ecx -; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: leal (%ecx,%ecx,8), %edx -; CHECK-NEXT: leal (%ecx,%edx,4), %ecx -; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: imull $1772, %eax, %eax # imm = 0x6EC +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: leal (%eax,%eax,8), %ecx +; CHECK-NEXT: leal (%eax,%ecx,4), %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl %t546 = urem i8 %tmp325, 37 Index: llvm/test/CodeGen/X86/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-lkk.ll +++ llvm/test/CodeGen/X86/urem-lkk.ll @@ -4,17 +4,13 @@ define i32 @fold_urem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_urem_positive_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309 -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: subl %ecx, %edx -; CHECK-NEXT: shrl %edx -; CHECK-NEXT: addl %ecx, %edx -; CHECK-NEXT: shrl $6, %edx -; CHECK-NEXT: imull $95, %edx, %ecx -; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %ecx +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %1 = urem i32 %x, 95 ret i32 %1 @@ -24,13 +20,13 @@ define i32 @fold_urem_positive_even(i32 %x) { ; CHECK-LABEL: fold_urem_positive_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3 -; CHECK-NEXT: imulq %rcx, %rdx -; CHECK-NEXT: shrq $42, %rdx -; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424 -; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: movabsq 
$17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %ecx # imm = 0x424 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %1 = urem i32 %x, 1060 ret i32 %1 Index: llvm/test/CodeGen/X86/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -6,82 +6,45 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl $2, %ecx -; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; SSE-NEXT: shrl $19, %ecx -; SSE-NEXT: imull $124, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movzwl %cx, %edx -; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %edx -; SSE-NEXT: imull $95, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl %ecx -; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; SSE-NEXT: shrl $17, %ecx -; SSE-NEXT: imull $98, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $9, %edx -; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [95,124,98,1003] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_1: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; AVX-NEXT: shrl $19, %ecx -; AVX-NEXT: imull $124, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movzwl %cx, %edx -; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %edx -; AVX-NEXT: imull $95, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX-NEXT: shrl $17, %ecx -; AVX-NEXT: imull $98, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $9, %edx -; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_urem_vec_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, 
%xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } @@ -89,20 +52,43 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_2: -; AVX: # %bb.0: -; 
AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_urem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [95,95,95,95] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } @@ -139,41 +125,39 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_power_of_two: ; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: andl $31, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: andl $7, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax 
-; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %ecx -; SSE-NEXT: imull $95, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: andl $31, %ecx -; SSE-NEXT: movd %xmm0, %edx -; SSE-NEXT: andl $63, %edx -; SSE-NEXT: movd %edx, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: andl $7, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: imull $45210183, %eax, %eax # imm = 0x2B1DA47 +; SSE-NEXT: imulq $95, %rax, %rax +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pinsrw $3, %eax, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_power_of_two: ; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: andl $31, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: andl $63, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: andl $7, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %ecx -; AVX-NEXT: imull $95, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: andl $31, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: andl $63, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: imull $45210183, %eax, %eax # imm = 0x2B1DA47 +; AVX-NEXT: imulq $95, %rax, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -183,64 +167,46 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; SSE-NEXT: shrl $16, 
%ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $4, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx ; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $6567229, %eax, %eax # imm = 0x64353D +; SSE-NEXT: imulq $654, %rax, %rax # imm = 0x28E +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: imull $186737709, %eax, %eax # imm = 0xB21642D +; SSE-NEXT: leaq (%rax,%rax,2), %rcx +; SSE-NEXT: shlq $3, %rcx +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; SSE-NEXT: shrl $26, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $791992, %eax, %eax # imm = 0xC15B8 +; SSE-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pinsrw $3, %eax, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $4, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx ; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, 
%ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $6567229, %eax, %eax # imm = 0x64353D +; AVX-NEXT: imulq $654, %rax, %rax # imm = 0x28E +; AVX-NEXT: shrq $32, %rax ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: imull $186737709, %eax, %eax # imm = 0xB21642D +; AVX-NEXT: leaq (%rax,%rax,2), %rcx +; AVX-NEXT: shlq $3, %rcx +; AVX-NEXT: subq %rax, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX-NEXT: shrl $26, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $791992, %eax, %eax # imm = 0xC15B8 +; AVX-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; AVX-NEXT: shrq $32, %rax ; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, @@ -375,4 +341,4 @@ ; AVX2-NEXT: retq %1 = urem <4 x i64> %x, ret <4 x i64> %1 -} \ No newline at end of file +} Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -625,15 +625,70 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 +; AVX1-NEXT: vpmulld %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2NOBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubw %xmm0, 
%xmm1, %xmm0 +; AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = srem <8 x i16> %a, ret <8 x i16> %res } @@ -699,71 +754,49 @@ ; AVX1-LABEL: test_rem7_16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, 
%xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1 ; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 
; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = srem <16 x i8> %a, Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -553,15 +553,31 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2 -; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; 
AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $15, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsraw $1, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm1 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 +; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpsubw %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: retq %res = srem <16 x i16> %a, ret <16 x i16> %res } @@ -647,23 +663,14 @@ ; ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 -; AVX512BW-NEXT: 
vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = srem <32 x i8> %a, ret <32 x i8> %res Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -442,20 +442,30 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] -; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 -; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3 -; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm5 +; AVX512F-NEXT: vpshufd {{.*#+}} 
zmm2 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm5 +; AVX512F-NEXT: vpmovdw %zmm5, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 +; AVX512F-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm3 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -616,16 +616,63 @@ ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2NOBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; 
AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = urem <8 x i16> %a, ret <8 x i16> %res } @@ -690,61 +737,37 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vpand 
{{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), 
%ymm0, %ymm0 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = urem <16 x i8> %a, Index: llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -561,16 +561,29 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm0 +; AVX512BW-NEXT: retq %res = urem <16 x i16> %a, ret <16 x i16> %res } @@ -646,20 +659,10 @@ ; ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq %res = urem <32 x i8> %a, ret <32 x i8> %res Index: llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -440,22 +440,23 @@ ; AVX512F-LABEL: 
test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm4 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ;