Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3970,6 +3970,10 @@
   /// power-of-2 denominators. If the target returns an empty SDValue, LLVM
   /// assumes SDIV is expensive and replaces it with a series of other integer
   /// operations.
+
+  SDValue BuildSREM(SDNode *Node, SelectionDAG &DAG, bool IsAfterLegalization,
+                    SmallVectorImpl<SDNode *> &Created) const;
+
   virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const;
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3916,6 +3916,25 @@
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
+  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr) &&
+      isConstantOrConstantVector(N1)) {
+    // Check if there is a div to combine with the rem.
+    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+    SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), {N0, N1});
+    if (!DivNode) {
+      SmallVector<SDNode *, 8> Built;
+      SDValue OptimizedRem =
+          isSigned ? TLI.BuildSREM(N, DAG, LegalOperations, Built)
+                   : SDValue(); // Placeholder for urem.
+      if (OptimizedRem.getNode()) {
+        for (SDNode *N : Built) {
+          AddToWorklist(N);
+        }
+        return OptimizedRem;
+      }
+    }
+  }
+
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
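For reference, the new TargetLowering::BuildSREM hook (added to TargetLowering.cpp below) lowers n srem d for a constant divisor, with pd = |d| and N-bit operands, as highbits - ((pd - 1) & (n >> (N - 1))), where c = floor((2^F - 1) / pd) + 1, lowbits = c * sext(n) mod 2^F, highbits = trunc(mulhu(lowbits, pd)), and F = 2 * N. A minimal standalone sketch of that computation for i32 % 95 follows; the srem95 helper and the exhaustive check are illustrative only, not part of the patch, and assume Clang/GCC's unsigned __int128 extension for the 64 x 64 -> 128-bit high multiply.

#include <cassert>
#include <cstdint>

// Illustrative only: i32 % 95 via the scheme BuildSREM emits (N = 32, F = 64).
static int32_t srem95(int32_t n) {
  const uint64_t pd = 95;                      // |divisor|, not a power of two
  const uint64_t c = ~0ULL / pd + 1;           // floor((2^64 - 1) / pd) + 1
  uint64_t lowbits = c * (uint64_t)(int64_t)n; // c * sext(n)  (mod 2^64)
  uint64_t highbits =                          // MULHU(lowbits, pd)
      (uint64_t)(((unsigned __int128)lowbits * pd) >> 64);
  int32_t adjust = n < 0 ? (int32_t)pd - 1 : 0; // (pd - 1) & (n >> 31)
  return (int32_t)highbits - adjust;
}

int main() {
  for (int32_t n = -100000; n <= 100000; ++n)
    assert(srem95(n) == n % 95); // matches C srem semantics for negative n too
  return 0;
}

The 64-bit constant c computed here corresponds to the mov/movk immediates (#7589, #4139, #55878, #689) materialized in the AArch64 lower_srem_positive_odd test further down.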
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4893,6 +4893,152 @@
   std::replace_if(Values.begin(), Values.end(), Predicate, Replacement);
 }
 
+/// Given an ISD::SREM where the divisor is constant,
+/// return a DAG expression that will generate the same result
+/// using only multiplications, additions and shifts.
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK).
+SDValue TargetLowering::BuildSREM(SDNode *Node, SelectionDAG &DAG,
+                                  bool IsAfterLegalization,
+                                  SmallVectorImpl<SDNode *> &Created) const {
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+  EVT FVT;
+  if (VT.isVector()) {
+    EVT TmpVT =
+        EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+    FVT =
+        EVT::getVectorVT(*DAG.getContext(), TmpVT, VT.getVectorElementCount());
+  } else {
+    FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  }
+
+  unsigned N = VT.getScalarSizeInBits();
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // Check to see if we can do this.
+  if (!isTypeLegal(FVT))
+    return SDValue();
+
+  // When optimizing for minimum size, we don't want to expand the division.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (!isOperationLegalOrCustom(ISD::MUL, FVT))
+    return SDValue();
+
+  if (!isOperationLegalOrCustom(ISD::SRA, FVT))
+    return SDValue();
+
+  SmallVector<SDValue, 16> MagicFactors, AbsoluteDivisors;
+  bool AllDivisorsAreOnes = true;
+  bool AllDivisorsArePowerOfTwo = true;
+
+  auto BuildSREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    // Calculate the magic number: c = floor((1 << F) / pd) + 1.
+    const APInt &D = DivisorConstant->getAPIntValue();
+    APInt pd = D.abs();
+    APInt IsPow2 = APInt(F, pd.isPowerOf2());
+    APInt C = APInt::getMaxValue(F)
+                  .udiv(pd.zext(F)) + APInt(F, 1) + IsPow2;
+
+    SDValue ApproximateReciprocal = DAG.getConstant(C, DL, FVT.getScalarType());
+    SDValue AbsoluteDivisor = DAG.getConstant(pd, DL, VT.getScalarType());
+
+    MagicFactors.push_back(ApproximateReciprocal);
+    AbsoluteDivisors.push_back(AbsoluteDivisor);
+
+    assert(!pd.isNullValue() && "Divisor cannot be zero");
+
+    AllDivisorsAreOnes &= pd.isOneValue();
+    AllDivisorsArePowerOfTwo &= pd.isPowerOf2();
+
+    if (!pd.isStrictlyPositive() || D.isMinSignedValue()) {
+      // The absolute divisor must be in the range (1, 2^(N-1)).
+      // We can lower remainders of division by powers of two much better
+      // elsewhere.
+      return false;
+    }
+
+    return true;
+  };
+
+  // Numerator.
+  SDValue Numerator = Node->getOperand(0);
+  SDValue ExtendedNumerator = DAG.getSExtOrTrunc(Numerator, DL, FVT);
+
+  // Divisor constant.
+  SDValue Divisor = Node->getOperand(1);
+
+  if (!ISD::matchUnaryPredicate(Divisor, BuildSREMPattern))
+    return SDValue();
+
+  // If this is an srem by one, avoid the fold since it can be constant-folded.
+  if (AllDivisorsAreOnes)
+    return SDValue();
+
+  // If this is an srem by a power of two (including INT_MIN), avoid the fold
+  // since it is best implemented as a bit test.
+  if (AllDivisorsArePowerOfTwo)
+    return SDValue();
+
+  // Absolute divisor.
+  SDValue AbsoluteDivisor = VT.isVector()
+                                ? DAG.getBuildVector(VT, DL, AbsoluteDivisors)
+                                : AbsoluteDivisors[0];
+  SDValue ExtendedAbsoluteDivisor =
+      DAG.getZExtOrTrunc(AbsoluteDivisor, DL, FVT);
+
+  SDValue MagicFactor = VT.isVector()
+                            ? DAG.getBuildVector(FVT, DL, MagicFactors)
+                            : MagicFactors[0];
+
+  // lowbits = c * n
+  SDValue Lowbits =
+      DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator);
+
+  // highbits = (lowbits * pd) >> F
+  SDValue Highbits;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, FVT)
                          : isOperationLegalOrCustom(ISD::MULHU, FVT))
+    Highbits =
+        DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedAbsoluteDivisor);
+  else if (IsAfterLegalization
+               ?
isOperationLegal(ISD::UMUL_LOHI, FVT) + : isOperationLegalOrCustom(ISD::UMUL_LOHI, FVT)) { + SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT), + Lowbits, ExtendedAbsoluteDivisor); + Highbits = SDValue(LoHi.getNode(), 1); + } else { + return SDValue(); // No mulhu or equivalent + } + SDValue TruncatedHighbits = DAG.getSExtOrTrunc(Highbits, DL, VT); + + // result = highbits -((pd - 1) & (n >> N-1)) + SDValue One = DAG.getConstant(1, DL, VT); + SDValue DecrementedAbsoluteDivisor = + DAG.getNode(ISD::SUB, DL, VT, AbsoluteDivisor, One); + SDValue ShiftAmount = DAG.getConstant(N - 1, DL, VT); + SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, Numerator, ShiftAmount); + SDValue And = DAG.getNode(ISD::AND, DL, VT, DecrementedAbsoluteDivisor, Sign); + SDValue Result = DAG.getNode(ISD::SUB, DL, VT, TruncatedHighbits, And); + + Created.push_back(MagicFactor.getNode()); + Created.push_back(ExtendedNumerator.getNode()); + Created.push_back(Lowbits.getNode()); + Created.push_back(AbsoluteDivisor.getNode()); + Created.push_back(ExtendedAbsoluteDivisor.getNode()); + Created.push_back(Highbits.getNode()); + Created.push_back(One.getNode()); + Created.push_back(DecrementedAbsoluteDivisor.getNode()); + Created.push_back(ShiftAmount.getNode()); + Created.push_back(Sign.getNode()); + Created.push_back(And.getNode()); + + return Result; +} + /// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE /// where the divisor is constant and the comparison target is zero, /// return a DAG expression that will generate the same comparison result Index: llvm/test/CodeGen/AArch64/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-lkk.ll @@ -1,76 +1,91 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -define i32 @fold_srem_positive_odd(i32 %x) { -; CHECK-LABEL: fold_srem_positive_odd: +define i32 @lower_srem_positive_odd(i32 %x) { +; CHECK-LABEL: lower_srem_positive_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 -; CHECK-NEXT: movk w8, #44150, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w8, w8, w0 -; CHECK-NEXT: asr w9, w8, #6 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #7589 +; CHECK-NEXT: movk x10, #4139, lsl #16 +; CHECK-NEXT: movk x10, #55878, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #689, lsl #48 +; CHECK-NEXT: mov w8, #94 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 95 ret i32 %1 } -define i32 @fold_srem_positive_even(i32 %x) { -; CHECK-LABEL: fold_srem_positive_even: +define i32 @lower_srem_positive_even(i32 %x) { +; CHECK-LABEL: lower_srem_positive_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #36849 -; CHECK-NEXT: movk w8, #15827, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x9, x8, #63 -; CHECK-NEXT: asr x8, x8, #40 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #1060 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #7172 +; CHECK-NEXT: movk x10, #61579, lsl #16 +; CHECK-NEXT: movk x10, #54159, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, 
#61, lsl #48 +; CHECK-NEXT: mov w8, #1059 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #1060 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, 1060 ret i32 %1 } -define i32 @fold_srem_negative_odd(i32 %x) { -; CHECK-LABEL: fold_srem_negative_odd: +define i32 @lower_srem_negative_odd(i32 %x) { +; CHECK-LABEL: lower_srem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65445 -; CHECK-NEXT: movk w8, #42330, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x9, x8, #63 -; CHECK-NEXT: asr x8, x8, #40 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-723 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #91 +; CHECK-NEXT: movk x10, #23205, lsl #16 +; CHECK-NEXT: movk x10, #42240, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #90, lsl #48 +; CHECK-NEXT: mov w8, #722 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #723 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, -723 ret i32 %1 } -define i32 @fold_srem_negative_even(i32 %x) { -; CHECK-LABEL: fold_srem_negative_even: +define i32 @lower_srem_negative_even(i32 %x) { +; CHECK-LABEL: lower_srem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62439 -; CHECK-NEXT: movk w8, #64805, lsl #16 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: lsr x9, x8, #63 -; CHECK-NEXT: asr x8, x8, #40 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-22981 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: mov x10, #21004 +; CHECK-NEXT: movk x10, #6399, lsl #16 +; CHECK-NEXT: movk x10, #55820, lsl #32 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: movk x10, #2, lsl #48 +; CHECK-NEXT: mov w8, #22980 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: mov w10, #22981 +; CHECK-NEXT: and w8, w8, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %1 = srem i32 %x, -22981 ret i32 %1 } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define i32 @combine_srem_sdiv(i32 %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: @@ -91,9 +106,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. -define i32 @dont_fold_srem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_srem_power_of_two: ; CHECK: // %bb.0: ; CHECK-NEXT: add w8, w0, #63 // =63 ; CHECK-NEXT: cmp w0, #0 // =0 @@ -105,9 +120,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_srem_one(i32 %x) { -; CHECK-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_srem_one(i32 %x) { +; CHECK-LABEL: dont_lower_srem_one: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret @@ -115,9 +130,9 @@ ret i32 %1 } -; Don't fold if the divisor is 2^31. -define i32 @dont_fold_srem_i32_smax(i32 %x) { -; CHECK-LABEL: dont_fold_srem_i32_smax: +; Don't lower if the divisor is 2^31. 
+define i32 @dont_lower_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_lower_srem_i32_smax: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #2147483647 ; CHECK-NEXT: add w8, w0, w8 @@ -130,9 +145,9 @@ ret i32 %1 } -; Don't fold i64 srem -define i64 @dont_fold_srem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem +define i64 @dont_lower_srem_i64(i64 %x) { +; CHECK-LABEL: dont_lower_srem_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #58849 ; CHECK-NEXT: movk x8, #48148, lsl #16 @@ -147,3 +162,69 @@ %1 = srem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 + +define void @srem_loop(i32 %x) { +; CHECK-LABEL: srem_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w30, -64 +; CHECK-NEXT: mov x23, #7589 +; CHECK-NEXT: movk x23, #4139, lsl #16 +; CHECK-NEXT: movk x23, #55878, lsl #32 +; CHECK-NEXT: adrp x21, .L.str +; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w22, #94 +; CHECK-NEXT: movk x23, #689, lsl #48 +; CHECK-NEXT: mov w24, #95 +; CHECK-NEXT: add x21, x21, :lo12:.L.str +; CHECK-NEXT: .LBB9_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: mul x9, x9, x23 +; CHECK-NEXT: and w8, w22, w0, asr #31 +; CHECK-NEXT: umulh x9, x9, x24 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: add w20, w8, w20 +; CHECK-NEXT: mov x0, x21 +; CHECK-NEXT: mov w1, w20 +; CHECK-NEXT: bl printf +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: cmp w0, w19 +; CHECK-NEXT: b.lo .LBB9_1 +; CHECK-NEXT: // %bb.2: // %afterloop +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = srem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) 
@printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/AArch64/srem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-seteq.ll +++ llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,17 +83,16 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #9363 +; CHECK-NEXT: mov w10, #9363 ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: movk w9, #37449, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: asr w9, w8, #3 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #14 -; CHECK-NEXT: msub w8, w8, w9, w0 -; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: mov w9, #13 +; CHECK-NEXT: movk w10, #4681, lsl #16 +; CHECK-NEXT: and w9, w9, w8, lsr #15 +; CHECK-NEXT: mul w8, w8, w10 +; CHECK-NEXT: mov w10, #14 +; CHECK-NEXT: umull x8, w8, w10 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i16 %X, 14 Index: llvm/test/CodeGen/AArch64/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -1,105 +1,53 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { -; CHECK-LABEL: fold_srem_vec_1: +define <4 x i16> @lower_srem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: lower_srem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63421 -; CHECK-NEXT: mov w12, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #31710, lsl #16 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: movk w12, #21399, lsl #16 -; CHECK-NEXT: smull x12, w11, w12 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x13, x12, #63 -; CHECK-NEXT: asr x12, x12, #37 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w12, w12, w13 -; CHECK-NEXT: mov w13, #98 -; CHECK-NEXT: sub w9, w9, w8 -; CHECK-NEXT: msub w11, w12, w13, w11 -; CHECK-NEXT: asr w13, w9, #6 -; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #37253 -; CHECK-NEXT: mov w10, #-124 -; CHECK-NEXT: smov w12, v0.h[0] -; CHECK-NEXT: movk w13, #44150, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w12, w13 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: asr w13, w10, #6 -; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: add w10, w13, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w12 -; CHECK-NEXT: mov w10, #63249 -; CHECK-NEXT: smov w13, v0.h[3] -; CHECK-NEXT: movk w10, #48808, lsl #16 -; CHECK-NEXT: smull x10, w13, w10 -; CHECK-NEXT: lsr x12, x10, #63 -; CHECK-NEXT: asr x10, x10, #40 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w10, w10, w12 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #-1003 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w10, w8, w13 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: adrp x8, .LCPI0_2 +; 
CHECK-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { -; CHECK-LABEL: fold_srem_vec_2: +define <4 x i16> @lower_srem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: lower_srem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #37253 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: movk w9, #44150, lsl #16 -; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: smull x13, w8, w9 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: smull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: add w13, w13, w8 -; CHECK-NEXT: smull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w14, w14, w10 -; CHECK-NEXT: asr w16, w13, #6 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w15, w15, w11 -; CHECK-NEXT: add w13, w16, w13, lsr #31 -; CHECK-NEXT: asr w16, w14, #6 -; CHECK-NEXT: add w9, w9, w12 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w15, #6 -; CHECK-NEXT: add w15, w16, w15, lsr #31 -; CHECK-NEXT: asr w16, w9, #6 -; CHECK-NEXT: add w9, w16, w9, lsr #31 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-NEXT: dup v4.4s, w8 +; CHECK-NEXT: movi v2.4s, #95 +; CHECK-NEXT: mul v1.4s, v1.4s, v4.4s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: movi v3.4h, #94 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: @@ -151,127 +99,93 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. 
+define <4 x i16> @dont_lower_srem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_srem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w12, w8, #31 // =31 -; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: mov w11, #37253 -; CHECK-NEXT: csel w12, w12, w8, lt -; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: smov w10, v0.h[3] -; CHECK-NEXT: movk w11, #44150, lsl #16 -; CHECK-NEXT: and w12, w12, #0xffffffe0 -; CHECK-NEXT: sub w8, w8, w12 -; CHECK-NEXT: add w12, w9, #63 // =63 -; CHECK-NEXT: smull x11, w10, w11 -; CHECK-NEXT: cmp w9, #0 // =0 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: add w11, w11, w10 -; CHECK-NEXT: and w12, w12, #0xffffffc0 -; CHECK-NEXT: sub w9, w9, w12 -; CHECK-NEXT: asr w12, w11, #6 -; CHECK-NEXT: add w11, w12, w11, lsr #31 -; CHECK-NEXT: smov w12, v0.h[2] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add w9, w12, #7 // =7 -; CHECK-NEXT: cmp w12, #0 // =0 -; CHECK-NEXT: csel w9, w9, w12, lt -; CHECK-NEXT: and w9, w9, #0xfffffff8 -; CHECK-NEXT: sub w9, w12, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #95 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w11, w8, w10 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. 
+define <4 x i16> @dont_lower_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w9, w9, w8 -; CHECK-NEXT: asr w12, w9, #4 -; CHECK-NEXT: add w9, w12, w9, lsr #31 -; CHECK-NEXT: mov w12, #30865 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: smov w11, v0.h[1] -; CHECK-NEXT: movk w12, #51306, lsl #16 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: smull x10, w11, w12 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: asr w12, w10, #9 -; CHECK-NEXT: mov w9, #654 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w9, w11 -; CHECK-NEXT: mov w10, #47143 -; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w10, w9, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is 2^15. -define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_srem_i16_smax: +; Don't lower if the divisor is 2^15. 
+define <4 x i16> @dont_lower_srem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_srem_i16_smax: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w10, #17097 +; CHECK-NEXT: mov x11, #45591 +; CHECK-NEXT: movk x11, #34192, lsl #16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: movk w10, #45590, lsl #16 -; CHECK-NEXT: smull x10, w9, w10 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: add w10, w10, w9 -; CHECK-NEXT: asr w12, w10, #4 -; CHECK-NEXT: mov w11, #23 -; CHECK-NEXT: add w10, w12, w10, lsr #31 -; CHECK-NEXT: msub w9, w10, w11, w9 -; CHECK-NEXT: mov w10, #47143 +; CHECK-NEXT: mov w10, #22 +; CHECK-NEXT: movk x11, #25644, lsl #32 +; CHECK-NEXT: movk x11, #2849, lsl #48 +; CHECK-NEXT: and w10, w10, w9, asr #31 +; CHECK-NEXT: sxtw x9, w9 +; CHECK-NEXT: mul x9, x9, x11 +; CHECK-NEXT: mov x11, #48291 +; CHECK-NEXT: movk x11, #1244, lsl #16 ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: movk w10, #24749, lsl #16 -; CHECK-NEXT: smull x10, w12, w10 -; CHECK-NEXT: lsr x11, x10, #63 -; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: mov w13, #5422 +; CHECK-NEXT: movk x11, #5559, lsl #32 +; CHECK-NEXT: movk x11, #12, lsl #48 +; CHECK-NEXT: and w13, w13, w12, asr #31 +; CHECK-NEXT: sxtw x12, w12 +; CHECK-NEXT: mul x11, x12, x11 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: umulh x9, x9, x12 +; CHECK-NEXT: mov w12, #5423 ; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: add w10, w10, w11 -; CHECK-NEXT: mov w11, #32767 -; CHECK-NEXT: add w11, w8, w11 +; CHECK-NEXT: umulh x11, x11, x12 +; CHECK-NEXT: mov w12, #32767 +; CHECK-NEXT: add w12, w8, w12 ; CHECK-NEXT: cmp w8, #0 // =0 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: and w11, w11, #0xffff8000 -; CHECK-NEXT: sub w8, w8, w11 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: and w12, w12, #0xffff8000 +; CHECK-NEXT: sub w8, w8, w12 ; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: sub w9, w9, w10 ; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #5423 ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w10, w8, w12 +; CHECK-NEXT: sub w8, w11, w13 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -279,9 +193,9 @@ ret <4 x i16> %1 } -; Don't fold i64 srem. -define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { -; CHECK-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem. +define <4 x i64> @dont_lower_srem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_lower_srem_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x9, #6055 ; CHECK-NEXT: movk x9, #58853, lsl #16 Index: llvm/test/CodeGen/ARM/urem-opt-size.ll =================================================================== --- llvm/test/CodeGen/ARM/urem-opt-size.ll +++ llvm/test/CodeGen/ARM/urem-opt-size.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; When optimising for minimum size, we don't want to expand a div to a mul ; and a shift sequence. As a result, the urem instruction e.g. 
will not be ; expanded to a sequence of umull, lsrs, muls and sub instructions, but @@ -14,20 +15,58 @@ target triple = "thumbv7m-arm-none-eabi" define i32 @foo1() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo1: -; CHECK:__aeabi_idiv -; CHECK-NOT: smmul +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: b __aeabi_idiv +; +; V7M-LABEL: foo1: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI0_0 +; V7M-NEXT: sdiv r0, r0, r1 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI0_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %div = sdiv i32 %call, 1000000 ret i32 %div } define i32 @foo2() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo2: -; CHECK: __aeabi_uidiv -; CHECK-NOT: umull +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: b __aeabi_uidiv +; +; V7M-LABEL: foo2: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI1_0 +; V7M-NEXT: udiv r0, r0, r1 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI1_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %div = udiv i32 %call, 1000000 ret i32 %div @@ -35,14 +74,34 @@ ; Test for unsigned remainder define i32 @foo3() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo3: -; CHECK: __aeabi_uidivmod -; CHECK-NOT: umull +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: bl __aeabi_uidivmod +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: lsr r0, r0, #5 +; CHECK-NEXT: pop {r11, pc} +; ; V7M-LABEL: foo3: -; V7M: udiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] -; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] -; V7M-NOT: __aeabi_uidivmod +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI2_0 +; V7M-NEXT: udiv r2, r0, r1 +; V7M-NEXT: mls r0, r2, r1, r0 +; V7M-NEXT: clz r0, r0 +; V7M-NEXT: lsrs r0, r0, #5 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI2_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %rem = urem i32 %call, 1000000 %cmp = icmp eq i32 %rem, 0 @@ -52,13 +111,31 @@ ; Test for signed remainder define i32 @foo4() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo4: -; CHECK:__aeabi_idivmod +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: bl __aeabi_idivmod +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: pop {r11, pc} +; ; V7M-LABEL: foo4: -; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] -; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] -; V7M-NOT: __aeabi_idivmod +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, 
.LCPI3_0 +; V7M-NEXT: sdiv r2, r0, r1 +; V7M-NEXT: mls r0, r2, r1, r0 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI3_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %rem = srem i32 %call, 1000000 ret i32 %rem @@ -68,14 +145,32 @@ ; as the division needs to be computed anyway in order to calculate ; the remainder (i.e. make sure we don't end up with two divisions). define i32 @foo5() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo5: -; CHECK:__aeabi_idivmod +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: bl __aeabi_idivmod +; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: pop {r11, pc} +; ; V7M-LABEL: foo5: -; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] -; V7M-NOT: sdiv -; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] -; V7M-NOT: __aeabi_idivmod +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI4_0 +; V7M-NEXT: sdiv r2, r0, r1 +; V7M-NEXT: mls r0, r2, r1, r0 +; V7M-NEXT: add r0, r2 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI4_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %div = sdiv i32 %call, 1000000 %rem = srem i32 %call, 1000000 @@ -89,9 +184,36 @@ ; legalization and this optimisation. ; Function Attrs: norecurse nounwind define i64 @isel_dont_hang(i32 %bar) local_unnamed_addr #4 { -entry: ; CHECK-LABEL: isel_dont_hang: -; CHECK: __aeabi_uldivmod +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: adds r2, r0, #2 +; CHECK-NEXT: adc r3, r1, #0 +; CHECK-NEXT: lsl r1, r1, #1 +; CHECK-NEXT: orr r1, r1, r0, lsr #31 +; CHECK-NEXT: lsl r0, r0, #1 +; CHECK-NEXT: bl __aeabi_uldivmod +; CHECK-NEXT: pop {r11, pc} +; +; V7M-LABEL: isel_dont_hang: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: .setfp r7, sp +; V7M-NEXT: mov r7, sp +; V7M-NEXT: asrs r1, r0, #31 +; V7M-NEXT: adds r2, r0, #2 +; V7M-NEXT: adc r3, r1, #0 +; V7M-NEXT: lsl.w r1, r1, #1 +; V7M-NEXT: orr.w r1, r1, r0, lsr #31 +; V7M-NEXT: lsl.w r0, r0, #1 +; V7M-NEXT: bl __aeabi_uldivmod +; V7M-NEXT: pop {r7, pc} +entry: %temp.0 = sext i32 %bar to i64 %mul83 = shl i64 %temp.0, 1 %add84 = add i64 %temp.0, 2 @@ -101,10 +223,24 @@ ; i16 types are promoted to i32, and we expect a normal udiv here: define i16 @isel_dont_hang_2(i16 %bar) local_unnamed_addr #4 { -entry: ; CHECK-LABEL: isel_dont_hang_2: -; CHECK: udiv -; CHECK-NOT: __aeabi_ +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add r1, r0, #2 +; CHECK-NEXT: lsl r0, r0, #1 +; CHECK-NEXT: uxth r1, r1 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: udiv r0, r0, r1 +; CHECK-NEXT: bx lr +; +; V7M-LABEL: isel_dont_hang_2: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: adds r1, r0, #2 +; V7M-NEXT: lsl.w r0, r0, #1 +; V7M-NEXT: uxth r1, r1 +; V7M-NEXT: uxth r0, r0 +; V7M-NEXT: udiv r0, r0, r1 +; V7M-NEXT: bx lr +entry: %mul83 = shl i16 %bar, 1 %add84 = add i16 %bar, 2 %div85 = udiv i16 %mul83, %add84 Index: llvm/test/CodeGen/PowerPC/machine-pre.ll =================================================================== --- llvm/test/CodeGen/PowerPC/machine-pre.ll +++ 
llvm/test/CodeGen/PowerPC/machine-pre.ll @@ -58,16 +58,21 @@ ; CHECK-P9-LABEL: foo: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mflr r0 +; CHECK-P9-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P9-NEXT: std r0, 16(r1) ; CHECK-P9-NEXT: stdu r1, -80(r1) -; CHECK-P9-NEXT: mr r30, r4 ; CHECK-P9-NEXT: mr r29, r3 ; CHECK-P9-NEXT: lis r3, 21845 +; CHECK-P9-NEXT: ori r3, r3, 21845 +; CHECK-P9-NEXT: sldi r3, r3, 32 +; CHECK-P9-NEXT: mr r30, r4 ; CHECK-P9-NEXT: add r28, r30, r29 +; CHECK-P9-NEXT: li r26, 3 +; CHECK-P9-NEXT: oris r3, r3, 21845 ; CHECK-P9-NEXT: ori r27, r3, 21846 ; CHECK-P9-NEXT: b .LBB1_4 ; CHECK-P9-NEXT: .p2align 4 @@ -93,12 +98,9 @@ ; CHECK-P9-NEXT: mr r30, r3 ; CHECK-P9-NEXT: extsw r3, r28 ; CHECK-P9-NEXT: mulld r4, r3, r27 -; CHECK-P9-NEXT: rldicl r5, r4, 1, 63 -; CHECK-P9-NEXT: rldicl r4, r4, 32, 32 -; CHECK-P9-NEXT: add r4, r4, r5 -; CHECK-P9-NEXT: slwi r5, r4, 1 -; CHECK-P9-NEXT: add r4, r4, r5 -; CHECK-P9-NEXT: subf r3, r4, r3 +; CHECK-P9-NEXT: rlwinm r3, r3, 2, 30, 30 +; CHECK-P9-NEXT: mulhdu r4, r4, r26 +; CHECK-P9-NEXT: subf r3, r3, r4 ; CHECK-P9-NEXT: cmplwi r3, 1 ; CHECK-P9-NEXT: beq cr0, .LBB1_1 ; CHECK-P9-NEXT: # %bb.5: # %while.cond @@ -139,6 +141,7 @@ ; CHECK-P9-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-P9-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-P9-NEXT: blr entry: %add = add nsw i32 %y, %x Index: llvm/test/CodeGen/PowerPC/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/srem-lkk.ll +++ llvm/test/CodeGen/PowerPC/srem-lkk.ll @@ -2,8 +2,8 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck -check-prefixes=CHECK,CHECK64 %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefixes=CHECK,CHECK32 %s -define i32 @fold_srem_positive_odd(i32 %x) { -; CHECK-LABEL: fold_srem_positive_odd: +define i32 @lower_srem_positive_odd(i32 %x) { +; CHECK-LABEL: lower_srem_positive_odd: ; CHECK: # %bb.0: ; CHECK-NEXT: lis 4, -21386 ; CHECK-NEXT: ori 4, 4, 37253 @@ -20,8 +20,8 @@ } -define i32 @fold_srem_positive_even(i32 %x) { -; CHECK-LABEL: fold_srem_positive_even: +define i32 @lower_srem_positive_even(i32 %x) { +; CHECK-LABEL: lower_srem_positive_even: ; CHECK: # %bb.0: ; CHECK-NEXT: lis 4, 15827 ; CHECK-NEXT: ori 4, 4, 36849 @@ -37,8 +37,8 @@ } -define i32 @fold_srem_negative_odd(i32 %x) { -; CHECK-LABEL: fold_srem_negative_odd: +define i32 @lower_srem_negative_odd(i32 %x) { +; CHECK-LABEL: lower_srem_negative_odd: ; CHECK: # %bb.0: ; CHECK-NEXT: lis 4, -23206 ; CHECK-NEXT: ori 4, 4, 65445 @@ -54,8 +54,8 @@ } -define i32 @fold_srem_negative_even(i32 %x) { -; CHECK-LABEL: fold_srem_negative_even: +define i32 @lower_srem_negative_even(i32 %x) { +; CHECK-LABEL: lower_srem_negative_even: ; CHECK: # %bb.0: ; CHECK-NEXT: lis 4, -731 ; CHECK-NEXT: ori 4, 4, 62439 @@ -71,7 +71,7 @@ } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define i32 @combine_srem_sdiv(i32 %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: # %bb.0: @@ -92,9 +92,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. 
-define i32 @dont_fold_srem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_srem_power_of_two: ; CHECK: # %bb.0: ; CHECK-NEXT: srawi 4, 3, 6 ; CHECK-NEXT: addze 4, 4 @@ -105,9 +105,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_srem_one(i32 %x) { -; CHECK-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_srem_one(i32 %x) { +; CHECK-LABEL: dont_lower_srem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr @@ -115,9 +115,9 @@ ret i32 %1 } -; Don't fold if the divisor is 2^31. -define i32 @dont_fold_srem_i32_smax(i32 %x) { -; CHECK-LABEL: dont_fold_srem_i32_smax: +; Don't lower if the divisor is 2^31. +define i32 @dont_lower_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_lower_srem_i32_smax: ; CHECK: # %bb.0: ; CHECK-NEXT: srawi 4, 3, 31 ; CHECK-NEXT: addze 4, 4 @@ -128,9 +128,9 @@ ret i32 %1 } -; Don't fold i64 srem -define i64 @dont_fold_srem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem +define i64 @dont_lower_srem_i64(i64 %x) { +; CHECK-LABEL: dont_lower_srem_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: mflr 0 ; CHECK-NEXT: stw 0, 4(1) @@ -147,3 +147,119 @@ %1 = srem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 + +define void @srem_loop(i32 %x) { +; CHECK64-LABEL: srem_loop: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: mflr 0 +; CHECK64-NEXT: stw 0, 4(1) +; CHECK64-NEXT: stwu 1, -32(1) +; CHECK64-NEXT: .cfi_def_cfa_offset 32 +; CHECK64-NEXT: .cfi_offset lr, 4 +; CHECK64-NEXT: .cfi_offset r27, -20 +; CHECK64-NEXT: .cfi_offset r28, -16 +; CHECK64-NEXT: .cfi_offset r29, -12 +; CHECK64-NEXT: .cfi_offset r30, -8 +; CHECK64-NEXT: lis 4, -21386 +; CHECK64-NEXT: lis 5, .L.str@ha +; CHECK64-NEXT: stw 27, 12(1) # 4-byte Folded Spill +; CHECK64-NEXT: ori 27, 4, 37253 +; CHECK64-NEXT: stw 28, 16(1) # 4-byte Folded Spill +; CHECK64-NEXT: la 28, .L.str@l(5) +; CHECK64-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK64-NEXT: li 29, 0 +; CHECK64-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK64-NEXT: mr 30, 3 +; CHECK64-NEXT: li 3, 1 +; CHECK64-NEXT: .LBB9_1: # %loop +; CHECK64-NEXT: # +; CHECK64-NEXT: mulhw 4, 3, 27 +; CHECK64-NEXT: crxor 6, 6, 6 +; CHECK64-NEXT: add 4, 4, 3 +; CHECK64-NEXT: srwi 5, 4, 31 +; CHECK64-NEXT: srawi 4, 4, 6 +; CHECK64-NEXT: add 4, 4, 5 +; CHECK64-NEXT: mulli 4, 4, 95 +; CHECK64-NEXT: subf 3, 4, 3 +; CHECK64-NEXT: add 29, 3, 29 +; CHECK64-NEXT: mr 3, 28 +; CHECK64-NEXT: mr 4, 29 +; CHECK64-NEXT: bl printf +; CHECK64-NEXT: cmplw 3, 30 +; CHECK64-NEXT: blt 0, .LBB9_1 +; CHECK64-NEXT: # %bb.2: # %afterloop +; CHECK64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload +; CHECK64-NEXT: lwz 0, 36(1) +; CHECK64-NEXT: addi 1, 1, 32 +; CHECK64-NEXT: mtlr 0 +; CHECK64-NEXT: blr +; +; CHECK32-LABEL: srem_loop: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: mflr 0 +; CHECK32-NEXT: stw 0, 4(1) +; CHECK32-NEXT: stwu 1, -32(1) +; CHECK32-NEXT: .cfi_def_cfa_offset 32 +; CHECK32-NEXT: .cfi_offset lr, 4 +; CHECK32-NEXT: .cfi_offset r27, -20 +; CHECK32-NEXT: .cfi_offset r28, -16 +; CHECK32-NEXT: .cfi_offset r29, -12 +; CHECK32-NEXT: .cfi_offset r30, -8 +; 
CHECK32-NEXT: lis 4, -21386 +; CHECK32-NEXT: lis 5, .L.str@ha +; CHECK32-NEXT: stw 27, 12(1) # 4-byte Folded Spill +; CHECK32-NEXT: stw 28, 16(1) # 4-byte Folded Spill +; CHECK32-NEXT: stw 29, 20(1) # 4-byte Folded Spill +; CHECK32-NEXT: stw 30, 24(1) # 4-byte Folded Spill +; CHECK32-NEXT: mr 30, 3 +; CHECK32-NEXT: li 29, 0 +; CHECK32-NEXT: li 3, 1 +; CHECK32-NEXT: ori 27, 4, 37253 +; CHECK32-NEXT: la 28, .L.str@l(5) +; CHECK32-NEXT: .LBB9_1: # %loop +; CHECK32-NEXT: # +; CHECK32-NEXT: mulhw 4, 3, 27 +; CHECK32-NEXT: add 4, 4, 3 +; CHECK32-NEXT: srwi 5, 4, 31 +; CHECK32-NEXT: srawi 4, 4, 6 +; CHECK32-NEXT: add 4, 4, 5 +; CHECK32-NEXT: mulli 4, 4, 95 +; CHECK32-NEXT: subf 3, 4, 3 +; CHECK32-NEXT: add 29, 3, 29 +; CHECK32-NEXT: crxor 6, 6, 6 +; CHECK32-NEXT: mr 3, 28 +; CHECK32-NEXT: mr 4, 29 +; CHECK32-NEXT: bl printf +; CHECK32-NEXT: cmplw 3, 30 +; CHECK32-NEXT: blt 0, .LBB9_1 +; CHECK32-NEXT: # %bb.2: # %afterloop +; CHECK32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 28, 16(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 27, 12(1) # 4-byte Folded Reload +; CHECK32-NEXT: lwz 0, 36(1) +; CHECK32-NEXT: addi 1, 1, 32 +; CHECK32-NEXT: mtlr 0 +; CHECK32-NEXT: blr +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = srem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -8,69 +8,75 @@ ; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ ; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8BE -define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { -; P9LE-LABEL: fold_srem_vec_1: +define <4 x i16> @lower_srem_vec_1(<4 x i16> %x) { +; P9LE-LABEL: lower_srem_vec_1: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r5, 689 ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: ori r5, r5, 55878 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 31710 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: oris r5, r5, 4139 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: ori r5, r5, 7589 +; P9LE-NEXT: andi. 
r4, r4, 94 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: lis r5, 528 +; P9LE-NEXT: ori r5, r5, 33825 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: oris r5, r5, 2114 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r5, r5, 4229 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 63421 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r4, r5 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 21399 -; P9LE-NEXT: mulli r4, r4, -124 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 124 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 123 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 668 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r5, r5, 48148 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 58848 +; P9LE-NEXT: ori r5, r5, 42800 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 33437 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 5 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, -16728 -; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 98 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 97 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 65 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 22280 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 61158 +; P9LE-NEXT: ori r5, r5, 14506 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 63249 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 8 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, -1003 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 1003 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 1002 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 @@ -78,205 +84,221 @@ ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; -; P9BE-LABEL: fold_srem_vec_1: +; P9BE-LABEL: lower_srem_vec_1: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: lis r5, 65 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 22280 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, 31710 -; P9BE-NEXT: ori r4, r4, 63421 +; P9BE-NEXT: oris r5, r5, 61158 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r4, r3, r4 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -124 +; P9BE-NEXT: ori r5, r5, 14506 +; P9BE-NEXT: andi. 
r4, r4, 1002 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 1003 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: lis r5, 668 +; P9BE-NEXT: ori r5, r5, 48148 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: oris r5, r5, 58848 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r5, r5, 42800 ; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 98 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 97 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -16728 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 528 +; P9BE-NEXT: ori r5, r5, 33825 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 2114 +; P9BE-NEXT: ori r5, r5, 4229 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 63249 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 8 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, -1003 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 124 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 123 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, 21399 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r5, r5, 55878 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 4139 +; P9BE-NEXT: ori r5, r5, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 33437 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 5 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 98 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 94 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 -; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: fold_srem_vec_1: +; P8LE-LABEL: lower_srem_vec_1: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 21399 -; P8LE-NEXT: lis r9, -16728 -; P8LE-NEXT: lis r11, -21386 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 33437 -; P8LE-NEXT: ori r9, r9, 63249 -; P8LE-NEXT: ori r11, r11, 37253 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: rldicl r3, r5, 32, 48 -; P8LE-NEXT: rldicl r6, r5, 16, 48 -; P8LE-NEXT: clrldi r7, r5, 48 -; P8LE-NEXT: extsh r8, r3 -; P8LE-NEXT: extsh r10, r6 -; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: lis r4, 528 +; P8LE-NEXT: lis r5, 668 +; P8LE-NEXT: lis r6, 65 +; P8LE-NEXT: li r11, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: ori r4, r4, 33825 +; P8LE-NEXT: ori r5, r5, 48148 +; P8LE-NEXT: ori r6, r6, 22280 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r7, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r4, r4, 2114 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: sldi r6, r6, 32 +; P8LE-NEXT: oris r5, r5, 58848 +; P8LE-NEXT: ori r4, r4, 4229 +; P8LE-NEXT: clrldi r8, r7, 48 +; P8LE-NEXT: rldicl r9, r7, 48, 48 +; P8LE-NEXT: oris r6, r6, 61158 +; P8LE-NEXT: ori r5, r5, 42800 +; P8LE-NEXT: extsh r8, r8 +; P8LE-NEXT: rldicl r10, r7, 32, 48 +; P8LE-NEXT: extsh r9, r9 +; P8LE-NEXT: ori r6, r6, 14506 ; P8LE-NEXT: extsw r8, r8 -; P8LE-NEXT: extsh r12, r7 +; P8LE-NEXT: rldicl r7, r7, 16, 48 +; P8LE-NEXT: extsh r10, r10 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: mulld r3, r8, r3 +; P8LE-NEXT: extsh r7, r7 ; P8LE-NEXT: extsw r10, r10 -; P8LE-NEXT: mulld r4, r8, r4 -; P8LE-NEXT: lis r8, 31710 -; P8LE-NEXT: extsh r0, r5 -; P8LE-NEXT: extsw r12, r12 -; P8LE-NEXT: mulld r9, r10, r9 -; P8LE-NEXT: ori r8, r8, 63421 -; P8LE-NEXT: extsw r10, r0 -; P8LE-NEXT: mulld r11, r12, r11 -; P8LE-NEXT: mulld r8, r10, r8 -; P8LE-NEXT: rldicl r0, r4, 1, 63 -; P8LE-NEXT: rldicl r4, r4, 32, 32 -; P8LE-NEXT: rldicl r30, r9, 1, 63 -; P8LE-NEXT: rldicl r9, r9, 32, 32 -; P8LE-NEXT: rldicl r11, r11, 32, 32 -; P8LE-NEXT: rldicl r8, r8, 32, 32 -; P8LE-NEXT: add r11, r11, r12 -; P8LE-NEXT: srawi r4, r4, 5 -; P8LE-NEXT: subf r8, r10, r8 -; P8LE-NEXT: srawi r9, r9, 8 -; P8LE-NEXT: srwi r10, r11, 31 -; P8LE-NEXT: add r4, r4, r0 -; P8LE-NEXT: srawi r11, r11, 6 -; P8LE-NEXT: add r9, r9, r30 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: add r10, r11, r10 -; P8LE-NEXT: srwi r11, r8, 31 -; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: mulli r4, r4, 98 -; P8LE-NEXT: mulli r9, r9, -1003 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r8, r8, -124 -; P8LE-NEXT: subf r3, r4, r3 -; P8LE-NEXT: subf r4, r9, r6 +; P8LE-NEXT: mulld r4, r9, r4 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: mulld r5, r10, r5 +; P8LE-NEXT: mulld r6, r7, r6 +; P8LE-NEXT: srawi r8, r8, 31 +; P8LE-NEXT: srawi r9, r9, 31 +; P8LE-NEXT: mulhdu r3, r3, r11 +; P8LE-NEXT: li r11, 124 +; P8LE-NEXT: andi. r8, r8, 94 +; P8LE-NEXT: andi. 
r9, r9, 123 +; P8LE-NEXT: mulhdu r4, r4, r11 +; P8LE-NEXT: li r11, 98 +; P8LE-NEXT: mulhdu r5, r5, r11 +; P8LE-NEXT: li r11, 1003 +; P8LE-NEXT: mulhdu r6, r6, r11 +; P8LE-NEXT: subf r3, r8, r3 +; P8LE-NEXT: srawi r8, r10, 31 ; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: srawi r3, r7, 31 +; P8LE-NEXT: andi. r7, r8, 97 +; P8LE-NEXT: subf r4, r9, r4 +; P8LE-NEXT: andi. r3, r3, 1002 +; P8LE-NEXT: subf r5, r7, r5 ; P8LE-NEXT: mtvsrd f1, r4 -; P8LE-NEXT: subf r4, r8, r5 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: subf r3, r3, r6 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; -; P8BE-LABEL: fold_srem_vec_1: +; P8BE-LABEL: lower_srem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -16728 -; P8BE-NEXT: lis r9, 31710 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: lis r10, -21386 -; P8BE-NEXT: ori r3, r3, 63249 -; P8BE-NEXT: ori r9, r9, 63421 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: ori r10, r10, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: lis r3, 65 +; P8BE-NEXT: mfvsrd r7, v2 +; P8BE-NEXT: lis r4, 668 +; P8BE-NEXT: lis r5, 528 +; P8BE-NEXT: lis r6, 689 +; P8BE-NEXT: li r11, 1003 +; P8BE-NEXT: ori r3, r3, 22280 +; P8BE-NEXT: ori r4, r4, 48148 +; P8BE-NEXT: ori r5, r5, 33825 +; P8BE-NEXT: ori r6, r6, 55878 +; P8BE-NEXT: li r12, 98 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r8, r7, 48 +; P8BE-NEXT: oris r3, r3, 61158 +; P8BE-NEXT: extsh r8, r8 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: rldicl r9, r7, 48, 48 +; P8BE-NEXT: ori r3, r3, 14506 +; P8BE-NEXT: extsw r8, r8 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: sldi r6, r6, 32 +; P8BE-NEXT: oris r4, r4, 58848 +; P8BE-NEXT: extsh r9, r9 +; P8BE-NEXT: rldicl r10, r7, 32, 48 +; P8BE-NEXT: rldicl r7, r7, 16, 48 +; P8BE-NEXT: oris r5, r5, 2114 +; P8BE-NEXT: oris r6, r6, 4139 +; P8BE-NEXT: ori r4, r4, 42800 +; P8BE-NEXT: extsw r9, r9 +; P8BE-NEXT: mulld r3, r8, r3 +; P8BE-NEXT: extsh r10, r10 ; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 -; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: ori r5, r5, 4229 +; P8BE-NEXT: ori r6, r6, 7589 +; P8BE-NEXT: extsw r10, r10 ; P8BE-NEXT: extsw r7, r7 -; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: extsw r4, r4 -; P8BE-NEXT: mulld r9, r7, r9 -; P8BE-NEXT: mulld r8, r6, r8 -; P8BE-NEXT: mulld r10, r4, r10 -; P8BE-NEXT: rldicl r11, r3, 1, 63 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: rldicl r12, r8, 1, 63 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: subf r9, r7, r9 -; P8BE-NEXT: srawi r3, r3, 8 -; P8BE-NEXT: srawi r8, r8, 5 -; P8BE-NEXT: add r10, r10, r4 -; P8BE-NEXT: add r3, r3, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: add r8, r8, r12 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r3, r3, -1003 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: mulli r9, r9, -124 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: mulld r4, r9, r4 +; P8BE-NEXT: 
mulld r5, r10, r5 +; P8BE-NEXT: mulld r6, r7, r6 +; P8BE-NEXT: srawi r8, r8, 31 +; P8BE-NEXT: mulhdu r3, r3, r11 +; P8BE-NEXT: li r11, 124 +; P8BE-NEXT: andi. r8, r8, 1002 +; P8BE-NEXT: srawi r9, r9, 31 +; P8BE-NEXT: srawi r10, r10, 31 +; P8BE-NEXT: mulhdu r4, r4, r12 +; P8BE-NEXT: li r12, 95 +; P8BE-NEXT: mulhdu r5, r5, r11 +; P8BE-NEXT: mulhdu r6, r6, r12 +; P8BE-NEXT: subf r3, r8, r3 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: andi. r8, r9, 97 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: subf r5, r8, r6 +; P8BE-NEXT: subf r4, r8, r4 +; P8BE-NEXT: andi. r8, r10, 123 ; P8BE-NEXT: mtvsrd v2, r3 -; P8BE-NEXT: subf r6, r9, r7 -; P8BE-NEXT: sldi r3, r5, 48 -; P8BE-NEXT: subf r4, r10, r4 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: andi. r3, r7, 94 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: subf r3, r3, r6 ; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r4 +; P8BE-NEXT: mtvsrd v4, r5 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 ; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 @@ -285,63 +307,55 @@ ret <4 x i16> %1 } -define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { -; P9LE-LABEL: fold_srem_vec_2: +define <4 x i16> @lower_srem_vec_2(<4 x i16> %x) { +; P9LE-LABEL: lower_srem_vec_2: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r5, 689 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r6, r4, r5 -; P9LE-NEXT: rldicl r6, r6, 32, 32 -; P9LE-NEXT: add r4, r6, r4 -; P9LE-NEXT: srwi r6, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: ori r5, r5, 55878 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 4139 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: li r6, 95 +; P9LE-NEXT: ori r5, r5, 7589 +; P9LE-NEXT: andi. r4, r4, 94 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r6, r4, r5 -; P9LE-NEXT: rldicl r6, r6, 32, 32 -; P9LE-NEXT: add r4, r6, r4 -; P9LE-NEXT: srwi r6, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 +; P9LE-NEXT: andi. r4, r4, 94 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r6, r4, r5 -; P9LE-NEXT: rldicl r6, r6, 32, 32 -; P9LE-NEXT: add r4, r6, r4 -; P9LE-NEXT: srwi r6, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 +; P9LE-NEXT: andi. 
r4, r4, 94 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: mulhdu r3, r3, r6 +; P9LE-NEXT: andi. r4, r4, 94 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 @@ -351,64 +365,56 @@ ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; -; P9BE-LABEL: fold_srem_vec_2: +; P9BE-LABEL: lower_srem_vec_2: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: ori r5, r5, 55878 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 4139 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, -21386 -; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 32, 32 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: li r6, 95 +; P9BE-NEXT: ori r5, r5, 7589 +; P9BE-NEXT: andi. r4, r4, 94 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 32, 32 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: andi. r4, r4, 94 +; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r5, r3, r4 -; P9BE-NEXT: rldicl r5, r5, 32, 32 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r6, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r6 -; P9BE-NEXT: mulli r5, r5, 95 -; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: andi. r4, r4, 94 +; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: mulhdu r3, r3, r6 +; P9BE-NEXT: andi. 
r4, r4, 94 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 @@ -416,63 +422,53 @@ ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: fold_srem_vec_2: +; P8LE-LABEL: lower_srem_vec_2: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, -21386 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 37253 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: clrldi r3, r5, 48 -; P8LE-NEXT: rldicl r7, r5, 32, 48 -; P8LE-NEXT: extsh r8, r3 -; P8LE-NEXT: rldicl r6, r5, 48, 48 -; P8LE-NEXT: extsh r10, r7 -; P8LE-NEXT: rldicl r5, r5, 16, 48 -; P8LE-NEXT: extsw r8, r8 -; P8LE-NEXT: extsh r9, r6 -; P8LE-NEXT: extsw r10, r10 -; P8LE-NEXT: extsh r11, r5 -; P8LE-NEXT: mulld r12, r8, r4 -; P8LE-NEXT: extsw r9, r9 -; P8LE-NEXT: extsw r11, r11 -; P8LE-NEXT: mulld r30, r10, r4 -; P8LE-NEXT: mulld r0, r9, r4 -; P8LE-NEXT: mulld r4, r11, r4 -; P8LE-NEXT: rldicl r12, r12, 32, 32 -; P8LE-NEXT: add r8, r12, r8 -; P8LE-NEXT: rldicl r12, r30, 32, 32 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: rldicl r0, r0, 32, 32 -; P8LE-NEXT: rldicl r4, r4, 32, 32 -; P8LE-NEXT: add r10, r12, r10 -; P8LE-NEXT: add r9, r0, r9 -; P8LE-NEXT: srwi r0, r8, 31 -; P8LE-NEXT: add r4, r4, r11 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: srwi r12, r9, 31 -; P8LE-NEXT: add r8, r8, r0 -; P8LE-NEXT: srawi r9, r9, 6 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: srwi r11, r4, 31 -; P8LE-NEXT: srawi r4, r4, 6 -; P8LE-NEXT: add r9, r9, r12 -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: add r4, r4, r11 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r4, r4, 95 -; P8LE-NEXT: subf r3, r8, r3 -; P8LE-NEXT: subf r6, r9, r6 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 -; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r11, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: extsh r5, r5 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: extsw r5, r5 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: extsh r4, r4 +; P8LE-NEXT: mulld r8, r5, r3 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: extsw r4, r4 +; P8LE-NEXT: mulld r9, r6, r3 +; P8LE-NEXT: mulld r10, r7, r3 +; P8LE-NEXT: mulld r3, r4, r3 +; P8LE-NEXT: srawi r5, r5, 31 +; P8LE-NEXT: srawi r6, r6, 31 +; P8LE-NEXT: mulhdu r8, r8, r11 +; P8LE-NEXT: andi. r5, r5, 94 +; P8LE-NEXT: andi. r6, r6, 94 +; P8LE-NEXT: mulhdu r9, r9, r11 +; P8LE-NEXT: srawi r7, r7, 31 +; P8LE-NEXT: mulhdu r10, r10, r11 +; P8LE-NEXT: mulhdu r3, r3, r11 +; P8LE-NEXT: srawi r4, r4, 31 +; P8LE-NEXT: subf r5, r5, r8 +; P8LE-NEXT: andi. r4, r4, 94 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: andi. 
r5, r7, 94 +; P8LE-NEXT: subf r6, r6, r9 +; P8LE-NEXT: subf r5, r5, r10 +; P8LE-NEXT: subf r3, r4, r3 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 @@ -481,64 +477,56 @@ ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; -; P8BE-LABEL: fold_srem_vec_2: +; P8BE-LABEL: lower_srem_vec_2: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: li r11, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 ; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: extsh r5, r5 ; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: oris r3, r3, 4139 +; P8BE-NEXT: extsh r5, r5 ; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r6, r6 ; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: mulld r8, r5, r3 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: extsw r5, r5 ; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r6, r6 ; P8BE-NEXT: extsw r7, r7 -; P8BE-NEXT: mulld r9, r6, r3 ; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r8, r5, r3 +; P8BE-NEXT: mulld r9, r6, r3 ; P8BE-NEXT: mulld r10, r7, r3 ; P8BE-NEXT: mulld r3, r4, r3 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: add r8, r8, r5 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: add r9, r9, r6 -; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: add r10, r10, r7 -; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: srwi r11, r9, 31 -; P8BE-NEXT: add r3, r3, r4 -; P8BE-NEXT: srawi r9, r9, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: add r9, r9, r11 -; P8BE-NEXT: srwi r11, r10, 31 -; P8BE-NEXT: srawi r10, r10, 6 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: srwi r11, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: subf r5, r8, r5 -; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: srawi r5, r5, 31 +; P8BE-NEXT: srawi r6, r6, 31 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: andi. r5, r5, 94 +; P8BE-NEXT: mulhdu r8, r8, r11 +; P8BE-NEXT: srawi r4, r4, 31 +; P8BE-NEXT: andi. r6, r6, 94 +; P8BE-NEXT: andi. r7, r7, 94 +; P8BE-NEXT: mulhdu r9, r9, r11 +; P8BE-NEXT: mulhdu r10, r10, r11 +; P8BE-NEXT: andi. r4, r4, 94 +; P8BE-NEXT: mulhdu r3, r3, r11 +; P8BE-NEXT: subf r5, r5, r8 +; P8BE-NEXT: subf r6, r6, r9 +; P8BE-NEXT: subf r7, r7, r10 +; P8BE-NEXT: subf r3, r4, r3 ; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: subf r7, r10, r7 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: subf r3, r3, r4 ; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r6 ; P8BE-NEXT: mtvsrd v4, r4 ; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 ; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 ; P8BE-NEXT: blr @@ -547,7 +535,7 @@ } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; P9LE-LABEL: combine_srem_sdiv: ; P9LE: # %bb.0: @@ -859,9 +847,9 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. 
-define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { -; P9LE-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. +define <4 x i16> @dont_lower_srem_power_of_two(<4 x i16> %x) { +; P9LE-LABEL: dont_lower_srem_power_of_two: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -880,23 +868,6 @@ ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -21386 -; P9LE-NEXT: ori r5, r5, 37253 -; P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 6 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r4, r3 @@ -904,14 +875,32 @@ ; P9LE-NEXT: addze r4, r4 ; P9LE-NEXT: slwi r4, r4, 3 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 689 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 55878 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 4139 +; P9LE-NEXT: ori r5, r5, 7589 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 94 +; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_srem_power_of_two: +; P9BE-LABEL: dont_lower_srem_power_of_two: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -929,354 +918,373 @@ ; P9BE-NEXT: addze r4, r4 ; P9BE-NEXT: slwi r4, r4, 6 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: srawi r4, r3, 3 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 3 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 55878 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 4139 +; P9BE-NEXT: ori r5, r5, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: srawi r4, r3, 3 -; P9BE-NEXT: addze r4, r4 -; P9BE-NEXT: slwi r4, r4, 3 +; P9BE-NEXT: srawi r4, r3, 31 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 94 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v3, v2 ; P9BE-NEXT: blr ; -; P8LE-LABEL: dont_fold_srem_power_of_two: +; P8LE-LABEL: dont_lower_srem_power_of_two: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -21386 -; P8LE-NEXT: ori r3, r3, 37253 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r9, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 ; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 ; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: clrldi r7, r4, 48 -; P8LE-NEXT: extsh r6, r5 -; P8LE-NEXT: extsh r8, r7 -; P8LE-NEXT: extsw r6, r6 -; P8LE-NEXT: rldicl r9, r4, 48, 48 -; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: clrldi r6, r4, 48 +; P8LE-NEXT: extsh r5, r5 +; P8LE-NEXT: extsh r8, r6 +; P8LE-NEXT: extsw r5, r5 +; P8LE-NEXT: rldicl r7, r4, 48, 48 +; P8LE-NEXT: mulld r3, r5, r3 ; P8LE-NEXT: srawi r8, r8, 6 -; P8LE-NEXT: extsh r10, r9 +; P8LE-NEXT: extsh r10, r7 ; P8LE-NEXT: addze r8, r8 ; P8LE-NEXT: rldicl r4, r4, 32, 48 ; P8LE-NEXT: srawi r10, r10, 5 ; P8LE-NEXT: slwi r8, r8, 6 -; P8LE-NEXT: subf r7, r8, r7 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: mtvsrd f0, r7 -; P8LE-NEXT: add r3, r3, r6 -; P8LE-NEXT: addze r6, r10 -; P8LE-NEXT: srwi r10, r3, 31 -; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: slwi r6, r6, 5 -; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: add r3, r3, r10 -; P8LE-NEXT: extsh r10, r4 -; P8LE-NEXT: subf r6, r6, r9 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: srawi r8, r10, 3 +; P8LE-NEXT: addze r10, r10 +; P8LE-NEXT: subf r6, r8, r6 +; P8LE-NEXT: mulhdu r3, r3, r9 +; P8LE-NEXT: extsh r9, r4 +; P8LE-NEXT: slwi r8, r10, 5 +; P8LE-NEXT: mtvsrd f0, r6 +; P8LE-NEXT: srawi r9, r9, 3 +; P8LE-NEXT: subf r6, r8, r7 +; P8LE-NEXT: addze r9, r9 +; P8LE-NEXT: srawi r5, r5, 31 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: addze r7, r8 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: slwi r7, r9, 3 +; P8LE-NEXT: andi. 
r5, r5, 94 +; P8LE-NEXT: subf r4, r7, r4 +; P8LE-NEXT: subf r3, r5, r3 ; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: slwi r5, r7, 3 -; P8LE-NEXT: subf r4, r5, r4 -; P8LE-NEXT: mtvsrd f2, r3 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: xxswapd v5, vs3 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_srem_power_of_two: +; P8BE-LABEL: dont_lower_srem_power_of_two: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 32, 48 +; P8BE-NEXT: li r9, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: rldicl r6, r4, 16, 48 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: clrldi r4, r4, 48 +; P8BE-NEXT: ori r3, r3, 7589 ; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 -; P8BE-NEXT: rldicl r7, r4, 16, 48 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: srawi r8, r6, 5 ; P8BE-NEXT: extsh r7, r7 -; P8BE-NEXT: addze r8, r8 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: srawi r9, r7, 6 ; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: srawi r8, r5, 5 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: addze r8, r8 +; P8BE-NEXT: mulld r3, r4, r3 ; P8BE-NEXT: slwi r8, r8, 5 -; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: srawi r8, r6, 6 +; P8BE-NEXT: addze r8, r8 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: slwi r8, r8, 6 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: mulhdu r3, r3, r9 +; P8BE-NEXT: srawi r9, r7, 3 ; P8BE-NEXT: subf r6, r8, r6 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: slwi r8, r9, 6 -; P8BE-NEXT: add r3, r3, r5 -; P8BE-NEXT: subf r7, r8, r7 -; P8BE-NEXT: srwi r10, r3, 31 -; P8BE-NEXT: srawi r3, r3, 6 -; P8BE-NEXT: add r3, r3, r10 -; P8BE-NEXT: srawi r9, r4, 3 -; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: srawi r4, r4, 31 +; P8BE-NEXT: slwi r8, r9, 3 +; P8BE-NEXT: andi. r4, r4, 94 +; P8BE-NEXT: subf r5, r8, r7 ; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: slwi r6, r8, 3 -; P8BE-NEXT: subf r4, r6, r4 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: sldi r5, r7, 48 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: subf r3, r4, r3 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v3, r6 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v4, r5 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghh v3, v4, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { -; P9LE-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. 
+define <4 x i16> @dont_lower_srem_one(<4 x i16> %x) { +; P9LE-LABEL: dont_lower_srem_one: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: lis r5, 2849 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r5, r5, 25644 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -14230 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: xxlxor v4, v4, v4 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 9 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, -19946 -; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: oris r5, r5, 34192 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: ori r5, r5, 45591 +; P9LE-NEXT: andi. r4, r4, 22 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 23 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: lis r5, 12 +; P9LE-NEXT: ori r5, r5, 5559 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: oris r5, r5, 1244 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 48291 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 17097 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 24749 -; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 5423 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 5422 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: lis r5, 100 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r5, r5, 13628 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 18438 +; P9LE-NEXT: ori r5, r5, 17236 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 47143 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 654 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. 
r4, r4, 653 ; P9LE-NEXT: subf r3, r4, r3 -; P9LE-NEXT: vmrglh v3, v3, v4 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 ; P9LE-NEXT: vmrglh v2, v2, v4 -; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_srem_one: +; P9BE-LABEL: dont_lower_srem_one: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: lis r5, 12 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 5559 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: oris r5, r5, 1244 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: ori r5, r5, 48291 +; P9BE-NEXT: andi. r4, r4, 5422 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 5423 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: lis r5, 2849 +; P9BE-NEXT: ori r5, r5, 25644 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: oris r5, r5, 34192 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r5, r5, 45591 ; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 23 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. r4, r4, 22 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, -14230 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: lis r5, 100 +; P9BE-NEXT: ori r5, r5, 13628 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 18438 +; P9BE-NEXT: ori r5, r5, 17236 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 30865 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 9 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 654 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 653 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: dont_fold_srem_one: +; P8LE-LABEL: dont_lower_srem_one: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 24749 -; P8LE-NEXT: lis r8, -19946 -; P8LE-NEXT: lis r10, -14230 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: lis r5, 100 +; P8LE-NEXT: li r9, 23 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: ori r3, r3, 47143 -; P8LE-NEXT: ori r8, r8, 17097 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: extsh r7, r5 -; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: ori r5, r5, 13628 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r4, r4, 1244 +; P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: oris r5, r5, 18438 +; P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r7, r6, 32, 48 +; P8LE-NEXT: rldicl r8, r6, 16, 48 +; P8LE-NEXT: ori r5, r5, 17236 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: rldicl r6, r6, 48, 48 +; P8LE-NEXT: extsh r8, r8 ; P8LE-NEXT: extsw r7, r7 -; P8LE-NEXT: extsh r11, r4 -; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: extsw r8, r8 ; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: ori r7, r10, 30865 -; P8LE-NEXT: extsw r10, r11 -; P8LE-NEXT: mulld r8, r9, r8 -; P8LE-NEXT: mulld r7, r10, r7 -; P8LE-NEXT: rldicl r11, r3, 1, 63 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: rldicl r8, r8, 32, 32 -; P8LE-NEXT: rldicl r7, r7, 32, 32 -; P8LE-NEXT: add r8, r8, r9 -; P8LE-NEXT: srawi r3, r3, 11 -; P8LE-NEXT: add r7, r7, r10 -; P8LE-NEXT: srwi r9, r8, 31 -; P8LE-NEXT: srawi r8, r8, 4 -; P8LE-NEXT: add r3, r3, r11 -; P8LE-NEXT: add r8, r8, r9 -; P8LE-NEXT: srwi r9, r7, 31 -; P8LE-NEXT: srawi r7, r7, 9 -; P8LE-NEXT: mulli r3, r3, 5423 -; P8LE-NEXT: add r7, r7, r9 -; P8LE-NEXT: mulli r8, r8, 23 -; P8LE-NEXT: mulli r7, r7, 654 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r8, r6 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: mulld r5, r6, r5 +; P8LE-NEXT: srawi r7, r7, 31 +; P8LE-NEXT: srawi r8, r8, 31 +; P8LE-NEXT: andi. r7, r7, 22 +; P8LE-NEXT: srawi r6, r6, 31 +; P8LE-NEXT: mulhdu r3, r3, r9 +; P8LE-NEXT: li r9, 5423 +; P8LE-NEXT: andi. r6, r6, 653 +; P8LE-NEXT: mulhdu r4, r4, r9 +; P8LE-NEXT: li r9, 654 +; P8LE-NEXT: mulhdu r5, r5, r9 +; P8LE-NEXT: subf r3, r7, r3 +; P8LE-NEXT: andi. 
r7, r8, 5422 ; P8LE-NEXT: subf r4, r7, r4 -; P8LE-NEXT: mtvsrd f1, r3 -; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r6, r5 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f2, r3 ; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v4, v5 ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_srem_one: +; P8BE-LABEL: dont_lower_srem_one: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: lis r8, -14230 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: ori r8, r8, 30865 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: lis r3, 12 +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r4, 2849 +; P8BE-NEXT: lis r5, 100 +; P8BE-NEXT: li r9, 5423 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r4, r4, 25644 +; P8BE-NEXT: ori r5, r5, 13628 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r7, r6, 48 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: rldicl r8, r6, 48, 48 +; P8BE-NEXT: oris r3, r3, 1244 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: rldicl r6, r6, 32, 48 +; P8BE-NEXT: oris r4, r4, 34192 +; P8BE-NEXT: ori r3, r3, 48291 +; P8BE-NEXT: extsh r8, r8 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: oris r5, r5, 18438 +; P8BE-NEXT: ori r4, r4, 45591 ; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsw r8, r8 +; P8BE-NEXT: mulld r3, r7, r3 +; P8BE-NEXT: ori r5, r5, 17236 ; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: extsw r4, r4 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: mulld r7, r6, r7 -; P8BE-NEXT: mulld r8, r4, r8 -; P8BE-NEXT: rldicl r9, r3, 1, 63 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rldicl r7, r7, 32, 32 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: srawi r3, r3, 11 -; P8BE-NEXT: add r7, r7, r6 -; P8BE-NEXT: add r8, r8, r4 -; P8BE-NEXT: add r3, r3, r9 -; P8BE-NEXT: srwi r9, r7, 31 -; P8BE-NEXT: srawi r7, r7, 4 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: add r7, r7, r9 -; P8BE-NEXT: srwi r9, r8, 31 -; P8BE-NEXT: srawi r8, r8, 9 -; P8BE-NEXT: mulli r7, r7, 23 -; P8BE-NEXT: add r8, r8, r9 +; P8BE-NEXT: mulld r4, r8, r4 +; P8BE-NEXT: mulld r5, r6, r5 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: srawi r8, r8, 31 +; P8BE-NEXT: andi. r7, r7, 5422 +; P8BE-NEXT: srawi r6, r6, 31 +; P8BE-NEXT: mulhdu r3, r3, r9 +; P8BE-NEXT: li r9, 23 +; P8BE-NEXT: andi. r6, r6, 653 +; P8BE-NEXT: mulhdu r4, r4, r9 +; P8BE-NEXT: li r9, 654 +; P8BE-NEXT: mulhdu r5, r5, r9 ; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: mulli r8, r8, 654 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: sldi r5, r9, 48 +; P8BE-NEXT: sldi r9, r9, 48 +; P8BE-NEXT: subf r3, r7, r3 +; P8BE-NEXT: andi. r7, r8, 22 +; P8BE-NEXT: mtvsrd v2, r9 +; P8BE-NEXT: subf r4, r7, r4 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: subf r5, r7, r6 +; P8BE-NEXT: subf r5, r6, r5 +; P8BE-NEXT: sldi r4, r4, 48 ; P8BE-NEXT: mtvsrd v3, r3 ; P8BE-NEXT: sldi r3, r5, 48 -; P8BE-NEXT: subf r4, r8, r4 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v3, v4, v3 ; P8BE-NEXT: vmrghh v2, v2, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 @@ -1285,37 +1293,40 @@ ret <4 x i16> %1 } -; Don't fold if the divisor is 2^15. 
-define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; P9LE-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^15. +define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; P9LE-LABEL: dont_lower_urem_i16_smax: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r5, 2849 ; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r5, r5, 25644 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: lis r5, -19946 -; P9LE-NEXT: ori r5, r5, 17097 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: mulld r5, r4, r5 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r5, r4, 31 -; P9LE-NEXT: srawi r4, r4, 4 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: lis r5, 24749 -; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: oris r5, r5, 34192 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: ori r5, r5, 45591 +; P9LE-NEXT: andi. r4, r4, 22 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 23 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: lis r5, 12 +; P9LE-NEXT: ori r5, r5, 5559 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: oris r5, r5, 1244 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r5, r5, 48291 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r4, r3 -; P9LE-NEXT: extsw r4, r4 -; P9LE-NEXT: ori r5, r5, 47143 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r5, r4, 1, 63 -; P9LE-NEXT: rldicl r4, r4, 32, 32 -; P9LE-NEXT: srawi r4, r4, 11 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: srawi r4, r3, 31 +; P9LE-NEXT: extsw r3, r3 +; P9LE-NEXT: mulld r3, r3, r5 +; P9LE-NEXT: li r5, 5423 +; P9LE-NEXT: mulhdu r3, r3, r5 +; P9LE-NEXT: andi. r4, r4, 5422 ; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 @@ -1335,36 +1346,39 @@ ; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_urem_i16_smax: +; P9BE-LABEL: dont_lower_urem_i16_smax: ; P9BE: # %bb.0: -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: lis r5, 12 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: ori r5, r5, 5559 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: lis r4, -19946 -; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: oris r5, r5, 1244 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: add r4, r4, r3 -; P9BE-NEXT: srwi r5, r4, 31 -; P9BE-NEXT: srawi r4, r4, 4 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: ori r5, r5, 48291 +; P9BE-NEXT: andi. r4, r4, 5422 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 5423 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: lis r5, 2849 +; P9BE-NEXT: ori r5, r5, 25644 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: subf r3, r4, r3 -; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: oris r5, r5, 34192 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r5, r5, 45591 ; P9BE-NEXT: mtvsrd v3, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 31 ; P9BE-NEXT: extsw r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: mulld r4, r3, r4 -; P9BE-NEXT: rldicl r5, r4, 1, 63 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: srawi r4, r4, 11 -; P9BE-NEXT: add r4, r4, r5 -; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: li r5, 23 +; P9BE-NEXT: mulhdu r3, r3, r5 +; P9BE-NEXT: andi. 
r4, r4, 22 ; P9BE-NEXT: subf r3, r4, r3 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 @@ -1379,113 +1393,119 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: dont_fold_urem_i16_smax: +; P8LE-LABEL: dont_lower_urem_i16_smax: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r6, 24749 -; P8LE-NEXT: lis r7, -19946 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: li r8, 23 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: ori r6, r6, 47143 -; P8LE-NEXT: ori r7, r7, 17097 -; P8LE-NEXT: mfvsrd r3, f0 -; P8LE-NEXT: rldicl r4, r3, 16, 48 -; P8LE-NEXT: rldicl r5, r3, 32, 48 -; P8LE-NEXT: extsh r8, r4 -; P8LE-NEXT: extsh r9, r5 -; P8LE-NEXT: extsw r8, r8 -; P8LE-NEXT: extsw r9, r9 -; P8LE-NEXT: mulld r6, r8, r6 -; P8LE-NEXT: mulld r7, r9, r7 -; P8LE-NEXT: rldicl r3, r3, 48, 48 -; P8LE-NEXT: rldicl r8, r6, 32, 32 -; P8LE-NEXT: rldicl r7, r7, 32, 32 -; P8LE-NEXT: rldicl r6, r6, 1, 63 -; P8LE-NEXT: srawi r8, r8, 11 -; P8LE-NEXT: add r7, r7, r9 -; P8LE-NEXT: add r6, r8, r6 -; P8LE-NEXT: srwi r8, r7, 31 -; P8LE-NEXT: srawi r7, r7, 4 -; P8LE-NEXT: mulli r6, r6, 5423 -; P8LE-NEXT: add r7, r7, r8 -; P8LE-NEXT: extsh r8, r3 -; P8LE-NEXT: mulli r7, r7, 23 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: oris r4, r4, 1244 +; P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r6, r5, 32, 48 +; P8LE-NEXT: rldicl r7, r5, 16, 48 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: mulld r4, r7, r4 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: srawi r6, r6, 31 +; P8LE-NEXT: srawi r7, r7, 31 +; P8LE-NEXT: andi. r6, r6, 22 +; P8LE-NEXT: mulhdu r3, r3, r8 +; P8LE-NEXT: li r8, 5423 +; P8LE-NEXT: mulhdu r4, r4, r8 +; P8LE-NEXT: extsh r8, r5 ; P8LE-NEXT: srawi r8, r8, 15 +; P8LE-NEXT: subf r3, r6, r3 +; P8LE-NEXT: andi. 
r6, r7, 5422 +; P8LE-NEXT: addze r7, r8 ; P8LE-NEXT: subf r4, r6, r4 -; P8LE-NEXT: addze r6, r8 -; P8LE-NEXT: mtvsrd f0, r4 -; P8LE-NEXT: slwi r4, r6, 15 -; P8LE-NEXT: subf r5, r7, r5 -; P8LE-NEXT: subf r3, r4, r3 -; P8LE-NEXT: mtvsrd f1, r5 -; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: slwi r6, r7, 15 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r6, r5 +; P8LE-NEXT: mtvsrd f1, r4 ; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v4, v5 ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_urem_i16_smax: +; P8BE-LABEL: dont_lower_urem_i16_smax: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: lis r3, 12 +; P8BE-NEXT: mfvsrd r5, v2 +; P8BE-NEXT: lis r4, 2849 +; P8BE-NEXT: li r8, 5423 +; P8BE-NEXT: li r9, 23 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r4, r4, 25644 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r6, r5, 48 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: oris r3, r3, 1244 ; P8BE-NEXT: extsh r6, r6 -; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: rldicl r7, r5, 48, 48 +; P8BE-NEXT: oris r4, r4, 34192 +; P8BE-NEXT: ori r3, r3, 48291 ; P8BE-NEXT: extsw r6, r6 -; P8BE-NEXT: mulld r3, r5, r3 -; P8BE-NEXT: mulld r7, r6, r7 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: rldicl r8, r3, 1, 63 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rldicl r7, r7, 32, 32 -; P8BE-NEXT: srawi r3, r3, 11 -; P8BE-NEXT: add r7, r7, r6 -; P8BE-NEXT: add r3, r3, r8 -; P8BE-NEXT: srwi r8, r7, 31 -; P8BE-NEXT: srawi r7, r7, 4 -; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: add r7, r7, r8 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: ori r4, r4, 45591 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: mulld r3, r6, r3 +; P8BE-NEXT: mulld r4, r7, r4 +; P8BE-NEXT: rldicl r5, r5, 32, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: mulhdu r3, r3, r8 ; P8BE-NEXT: li r8, 0 -; P8BE-NEXT: mulli r7, r7, 23 -; P8BE-NEXT: srawi r9, r4, 15 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: sldi r5, r8, 48 -; P8BE-NEXT: addze r8, r9 -; P8BE-NEXT: mtvsrd v2, r5 -; P8BE-NEXT: subf r5, r7, r6 -; P8BE-NEXT: slwi r6, r8, 15 -; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mulhdu r4, r4, r9 +; P8BE-NEXT: srawi r9, r5, 15 +; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: srawi r6, r6, 31 +; P8BE-NEXT: srawi r7, r7, 31 +; P8BE-NEXT: andi. r6, r6, 5422 +; P8BE-NEXT: slwi r9, r9, 15 +; P8BE-NEXT: subf r3, r6, r3 +; P8BE-NEXT: andi. r6, r7, 22 +; P8BE-NEXT: subf r5, r9, r5 ; P8BE-NEXT: subf r4, r6, r4 -; P8BE-NEXT: mtvsrd v3, r3 -; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: sldi r8, r8, 48 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v2, r8 ; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: mtvsrd v4, r3 ; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: vmrghh v3, v4, v3 -; P8BE-NEXT: vmrghh v2, v2, v5 +; P8BE-NEXT: vmrghh v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold i64 srem. -define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { -; P9LE-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem. 
+define <4 x i64> @dont_lower_srem_i64(<4 x i64> %x) { +; P9LE-LABEL: dont_lower_srem_i64: ; P9LE: # %bb.0: ; P9LE-NEXT: lis r4, 24749 ; P9LE-NEXT: ori r4, r4, 47142 @@ -1529,7 +1549,7 @@ ; P9LE-NEXT: mtvsrdd v2, r3, r4 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_srem_i64: +; P9BE-LABEL: dont_lower_srem_i64: ; P9BE: # %bb.0: ; P9BE-NEXT: lis r4, 24749 ; P9BE-NEXT: ori r4, r4, 47142 @@ -1572,7 +1592,7 @@ ; P9BE-NEXT: mtvsrdd v2, 0, r3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: dont_fold_srem_i64: +; P8LE-LABEL: dont_lower_srem_i64: ; P8LE: # %bb.0: ; P8LE-NEXT: lis r3, 24749 ; P8LE-NEXT: lis r4, -19946 @@ -1621,7 +1641,7 @@ ; P8LE-NEXT: xxmrghd v2, vs1, vs3 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_srem_i64: +; P8BE-LABEL: dont_lower_srem_i64: ; P8BE: # %bb.0: ; P8BE-NEXT: lis r4, -19946 ; P8BE-NEXT: lis r3, 24749 Index: llvm/test/CodeGen/RISCV/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/srem-lkk.ll +++ llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -8,8 +8,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s -define i32 @fold_srem_positive_odd(i32 %x) { -; RV32I-LABEL: fold_srem_positive_odd: +define i32 @lower_srem_positive_odd(i32 %x) { +; RV32I-LABEL: lower_srem_positive_odd: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -23,7 +23,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_srem_positive_odd: +; RV32IM-LABEL: lower_srem_positive_odd: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lui a1, 706409 ; RV32IM-NEXT: addi a1, a1, 389 @@ -38,7 +38,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_srem_positive_odd: +; RV64I-LABEL: lower_srem_positive_odd: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -53,25 +53,21 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_srem_positive_odd: +; RV64IM-LABEL: lower_srem_positive_odd: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1045903 -; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: lui a1, 176602 +; RV64IM-NEXT: addiw a1, a1, 1121 ; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -905 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1767 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: add a1, a1, a0 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 6 -; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a1, a1, 345 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -603 +; RV64IM-NEXT: mul a1, a0, a1 ; RV64IM-NEXT: addi a2, zero, 95 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 94 +; RV64IM-NEXT: subw a0, a1, a0 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = srem i32 %x, 95 @@ -79,8 +75,8 @@ } -define i32 @fold_srem_positive_even(i32 %x) { -; RV32I-LABEL: fold_srem_positive_even: +define i32 @lower_srem_positive_even(i32 %x) { +; RV32I-LABEL: lower_srem_positive_even: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -94,7 +90,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_srem_positive_even: +; RV32IM-LABEL: lower_srem_positive_even: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lui a1, 253241 ; RV32IM-NEXT: addi a1, a1, -15 @@ -108,7 +104,7 @@ 
; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_srem_positive_even: +; RV64I-LABEL: lower_srem_positive_even: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -123,22 +119,21 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_srem_positive_even: +; RV64IM-LABEL: lower_srem_positive_even: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 506482 -; RV64IM-NEXT: addiw a1, a1, -31 +; RV64IM-NEXT: lui a1, 15828 +; RV64IM-NEXT: addiw a1, a1, -1793 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1113 ; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 711 -; RV64IM-NEXT: slli a1, a1, 19 -; RV64IM-NEXT: addi a1, a1, 1979 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 -; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a1, a1, -1020 +; RV64IM-NEXT: mul a1, a0, a1 ; RV64IM-NEXT: addi a2, zero, 1060 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 1059 +; RV64IM-NEXT: subw a0, a1, a0 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = srem i32 %x, 1060 @@ -146,8 +141,8 @@ } -define i32 @fold_srem_negative_odd(i32 %x) { -; RV32I-LABEL: fold_srem_negative_odd: +define i32 @lower_srem_negative_odd(i32 %x) { +; RV32I-LABEL: lower_srem_negative_odd: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -161,7 +156,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_srem_negative_odd: +; RV32IM-LABEL: lower_srem_negative_odd: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lui a1, 677296 ; RV32IM-NEXT: addi a1, a1, -91 @@ -175,7 +170,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_srem_negative_odd: +; RV64I-LABEL: lower_srem_negative_odd: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -190,25 +185,21 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_srem_negative_odd: +; RV64IM-LABEL: lower_srem_negative_odd: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 4781 -; RV64IM-NEXT: addiw a1, a1, 2045 -; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, 1371 +; RV64IM-NEXT: lui a1, 11603 +; RV64IM-NEXT: addiw a1, a1, -2045 ; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -1355 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: sub a1, a1, a0 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 9 -; RV64IM-NEXT: add a1, a1, a2 -; RV64IM-NEXT: addi a2, zero, -723 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: addi a1, a1, -1371 +; RV64IM-NEXT: slli a1, a1, 16 +; RV64IM-NEXT: addi a1, a1, 91 +; RV64IM-NEXT: mul a1, a0, a1 +; RV64IM-NEXT: addi a2, zero, 723 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 722 +; RV64IM-NEXT: subw a0, a1, a0 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = srem i32 %x, -723 @@ -216,8 +207,8 @@ } -define i32 @fold_srem_negative_even(i32 %x) { -; RV32I-LABEL: fold_srem_negative_even: +define i32 @lower_srem_negative_even(i32 %x) { +; RV32I-LABEL: lower_srem_negative_even: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -232,7 +223,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: 
ret ; -; RV32IM-LABEL: fold_srem_negative_even: +; RV32IM-LABEL: lower_srem_negative_even: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lui a1, 1036895 ; RV32IM-NEXT: addi a1, a1, 999 @@ -247,7 +238,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_srem_negative_even: +; RV64I-LABEL: lower_srem_negative_even: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -263,25 +254,23 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_srem_negative_even: +; RV64IM-LABEL: lower_srem_negative_even: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a0, a0 -; RV64IM-NEXT: lui a1, 1036895 -; RV64IM-NEXT: addiw a1, a1, 999 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 11 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -523 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -481 -; RV64IM-NEXT: mulh a1, a0, a1 -; RV64IM-NEXT: srli a2, a1, 63 -; RV64IM-NEXT: srai a1, a1, 12 -; RV64IM-NEXT: add a1, a1, a2 -; RV64IM-NEXT: lui a2, 1048570 -; RV64IM-NEXT: addiw a2, a2, 1595 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: srli a1, a0, 31 +; RV64IM-NEXT: lui a2, 6 +; RV64IM-NEXT: addiw a3, a2, -1596 +; RV64IM-NEXT: and a1, a1, a3 +; RV64IM-NEXT: lui a3, 11681 +; RV64IM-NEXT: addiw a3, a3, -999 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, -11 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, 524 +; RV64IM-NEXT: mul a0, a0, a3 +; RV64IM-NEXT: addiw a2, a2, -1595 +; RV64IM-NEXT: mulhu a0, a0, a2 +; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = srem i32 %x, -22981 @@ -289,7 +278,7 @@ } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define i32 @combine_srem_sdiv(i32 %x) { ; RV32I-LABEL: combine_srem_sdiv: ; RV32I: # %bb.0: @@ -392,9 +381,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. -define i32 @dont_fold_srem_power_of_two(i32 %x) { -; RV32I-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_srem_power_of_two(i32 %x) { +; RV32I-LABEL: dont_lower_srem_power_of_two: ; RV32I: # %bb.0: ; RV32I-NEXT: srai a1, a0, 31 ; RV32I-NEXT: srli a1, a1, 26 @@ -404,7 +393,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_srem_power_of_two: +; RV32IM-LABEL: dont_lower_srem_power_of_two: ; RV32IM: # %bb.0: ; RV32IM-NEXT: srai a1, a0, 31 ; RV32IM-NEXT: srli a1, a1, 26 @@ -414,7 +403,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_srem_power_of_two: +; RV64I-LABEL: dont_lower_srem_power_of_two: ; RV64I: # %bb.0: ; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: srli a1, a1, 57 @@ -428,7 +417,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_srem_power_of_two: +; RV64IM-LABEL: dont_lower_srem_power_of_two: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a1, a0 ; RV64IM-NEXT: srli a1, a1, 57 @@ -445,9 +434,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_srem_one(i32 %x) { -; CHECK-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_srem_one(i32 %x) { +; CHECK-LABEL: dont_lower_srem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: mv a0, zero ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -456,9 +445,9 @@ ret i32 %1 } -; Don't fold if the divisor is 2^31. 
-define i32 @dont_fold_srem_i32_smax(i32 %x) { -; RV32I-LABEL: dont_fold_srem_i32_smax: +; Don't lower if the divisor is 2^31. +define i32 @dont_lower_srem_i32_smax(i32 %x) { +; RV32I-LABEL: dont_lower_srem_i32_smax: ; RV32I: # %bb.0: ; RV32I-NEXT: srai a1, a0, 31 ; RV32I-NEXT: srli a1, a1, 1 @@ -469,7 +458,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_srem_i32_smax: +; RV32IM-LABEL: dont_lower_srem_i32_smax: ; RV32IM: # %bb.0: ; RV32IM-NEXT: srai a1, a0, 31 ; RV32IM-NEXT: srli a1, a1, 1 @@ -480,7 +469,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_srem_i32_smax: +; RV64I-LABEL: dont_lower_srem_i32_smax: ; RV64I: # %bb.0: ; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: srli a1, a1, 32 @@ -495,7 +484,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_srem_i32_smax: +; RV64IM-LABEL: dont_lower_srem_i32_smax: ; RV64IM: # %bb.0: ; RV64IM-NEXT: sext.w a1, a0 ; RV64IM-NEXT: srli a1, a1, 32 @@ -513,9 +502,9 @@ ret i32 %1 } -; Don't fold i64 srem -define i64 @dont_fold_srem_i64(i64 %x) { -; RV32I-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem +define i64 @dont_lower_srem_i64(i64 %x) { +; RV32I-LABEL: dont_lower_srem_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -530,7 +519,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_srem_i64: +; RV32IM-LABEL: dont_lower_srem_i64: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: .cfi_def_cfa_offset 16 @@ -545,7 +534,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_srem_i64: +; RV64I-LABEL: dont_lower_srem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -559,7 +548,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_srem_i64: +; RV64IM-LABEL: dont_lower_srem_i64: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lui a1, 2675 ; RV64IM-NEXT: addiw a1, a1, -251 @@ -581,3 +570,219 @@ %1 = srem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) 
local_unnamed_addr #1 + +define void @srem_loop(i32 %x) { +; RV32I-LABEL: srem_loop: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) +; RV32I-NEXT: sw s2, 0(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s1, zero +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: lui a1, %hi(.L.str) +; RV32I-NEXT: addi s0, a1, %lo(.L.str) +; RV32I-NEXT: .LBB9_1: # %loop +; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: add s1, a0, s1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call printf +; RV32I-NEXT: bltu a0, s2, .LBB9_1 +; RV32I-NEXT: # %bb.2: # %afterloop +; RV32I-NEXT: lw s2, 0(sp) +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: srem_loop: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: addi sp, sp, -32 +; RV32IM-NEXT: .cfi_def_cfa_offset 32 +; RV32IM-NEXT: sw ra, 28(sp) +; RV32IM-NEXT: sw s0, 24(sp) +; RV32IM-NEXT: sw s1, 20(sp) +; RV32IM-NEXT: sw s2, 16(sp) +; RV32IM-NEXT: sw s3, 12(sp) +; RV32IM-NEXT: sw s4, 8(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: .cfi_offset s0, -8 +; RV32IM-NEXT: .cfi_offset s1, -12 +; RV32IM-NEXT: .cfi_offset s2, -16 +; RV32IM-NEXT: .cfi_offset s3, -20 +; RV32IM-NEXT: .cfi_offset s4, -24 +; RV32IM-NEXT: mv s2, a0 +; RV32IM-NEXT: mv s1, zero +; RV32IM-NEXT: addi a0, zero, 1 +; RV32IM-NEXT: lui a1, 706409 +; RV32IM-NEXT: addi s3, a1, 389 +; RV32IM-NEXT: addi s4, zero, 95 +; RV32IM-NEXT: lui a1, %hi(.L.str) +; RV32IM-NEXT: addi s0, a1, %lo(.L.str) +; RV32IM-NEXT: .LBB9_1: # %loop +; RV32IM-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IM-NEXT: mulh a1, a0, s3 +; RV32IM-NEXT: add a1, a1, a0 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 6 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: mul a1, a1, s4 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: add s1, a0, s1 +; RV32IM-NEXT: mv a0, s0 +; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: call printf +; RV32IM-NEXT: bltu a0, s2, .LBB9_1 +; RV32IM-NEXT: # %bb.2: # %afterloop +; RV32IM-NEXT: lw s4, 8(sp) +; RV32IM-NEXT: lw s3, 12(sp) +; RV32IM-NEXT: lw s2, 16(sp) +; RV32IM-NEXT: lw s1, 20(sp) +; RV32IM-NEXT: lw s0, 24(sp) +; RV32IM-NEXT: lw ra, 28(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: .cfi_restore s0 +; RV32IM-NEXT: .cfi_restore s1 +; RV32IM-NEXT: .cfi_restore s2 +; RV32IM-NEXT: .cfi_restore s3 +; RV32IM-NEXT: .cfi_restore s4 +; RV32IM-NEXT: addi sp, sp, 32 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: srem_loop: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: sd s2, 0(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv s0, zero +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: lui a2, %hi(.L.str) +; RV64I-NEXT: addi s2, a2, %lo(.L.str) +; RV64I-NEXT: sext.w s1, a1 +; 
RV64I-NEXT: .LBB9_1: # %loop +; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: addw s0, a0, s0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call printf +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: bltu a1, s1, .LBB9_1 +; RV64I-NEXT: # %bb.2: # %afterloop +; RV64I-NEXT: ld s2, 0(sp) +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: srem_loop: +; RV64IM: # %bb.0: # %entry +; RV64IM-NEXT: addi sp, sp, -48 +; RV64IM-NEXT: .cfi_def_cfa_offset 48 +; RV64IM-NEXT: sd ra, 40(sp) +; RV64IM-NEXT: sd s0, 32(sp) +; RV64IM-NEXT: sd s1, 24(sp) +; RV64IM-NEXT: sd s2, 16(sp) +; RV64IM-NEXT: sd s3, 8(sp) +; RV64IM-NEXT: sd s4, 0(sp) +; RV64IM-NEXT: .cfi_offset ra, -8 +; RV64IM-NEXT: .cfi_offset s0, -16 +; RV64IM-NEXT: .cfi_offset s1, -24 +; RV64IM-NEXT: .cfi_offset s2, -32 +; RV64IM-NEXT: .cfi_offset s3, -40 +; RV64IM-NEXT: .cfi_offset s4, -48 +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: mv s0, zero +; RV64IM-NEXT: addi a0, zero, 1 +; RV64IM-NEXT: lui a2, 176602 +; RV64IM-NEXT: addiw a2, a2, 1121 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, 345 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi s2, a2, -603 +; RV64IM-NEXT: addi s3, zero, 95 +; RV64IM-NEXT: lui a2, %hi(.L.str) +; RV64IM-NEXT: addi s4, a2, %lo(.L.str) +; RV64IM-NEXT: sext.w s1, a1 +; RV64IM-NEXT: .LBB9_1: # %loop +; RV64IM-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: mul a1, a0, s2 +; RV64IM-NEXT: mulhu a1, a1, s3 +; RV64IM-NEXT: srli a0, a0, 31 +; RV64IM-NEXT: andi a0, a0, 94 +; RV64IM-NEXT: sub a0, a1, a0 +; RV64IM-NEXT: addw s0, a0, s0 +; RV64IM-NEXT: mv a0, s4 +; RV64IM-NEXT: mv a1, s0 +; RV64IM-NEXT: call printf +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: bltu a1, s1, .LBB9_1 +; RV64IM-NEXT: # %bb.2: # %afterloop +; RV64IM-NEXT: ld s4, 0(sp) +; RV64IM-NEXT: ld s3, 8(sp) +; RV64IM-NEXT: ld s2, 16(sp) +; RV64IM-NEXT: ld s1, 24(sp) +; RV64IM-NEXT: ld s0, 32(sp) +; RV64IM-NEXT: ld ra, 40(sp) +; RV64IM-NEXT: .cfi_restore ra +; RV64IM-NEXT: .cfi_restore s0 +; RV64IM-NEXT: .cfi_restore s1 +; RV64IM-NEXT: .cfi_restore s2 +; RV64IM-NEXT: .cfi_restore s3 +; RV64IM-NEXT: .cfi_restore s4 +; RV64IM-NEXT: addi sp, sp, 48 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = srem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) 
@printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/RISCV/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -8,8 +8,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s -define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { -; RV32I-LABEL: fold_srem_vec_1: +define <4 x i16> @lower_srem_vec_1(<4 x i16> %x) { +; RV32I-LABEL: lower_srem_vec_1: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -69,7 +69,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_srem_vec_1: +; RV32IM-LABEL: lower_srem_vec_1: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lh a6, 12(a1) ; RV32IM-NEXT: lh a3, 8(a1) @@ -120,7 +120,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_srem_vec_1: +; RV64I-LABEL: lower_srem_vec_1: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: .cfi_def_cfa_offset 64 @@ -180,7 +180,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_srem_vec_1: +; RV64IM-LABEL: lower_srem_vec_1: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a6, 24(a1) ; RV64IM-NEXT: lh a3, 16(a1) @@ -258,8 +258,8 @@ ret <4 x i16> %1 } -define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { -; RV32I-LABEL: fold_srem_vec_2: +define <4 x i16> @lower_srem_vec_2(<4 x i16> %x) { +; RV32I-LABEL: lower_srem_vec_2: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -319,7 +319,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_srem_vec_2: +; RV32IM-LABEL: lower_srem_vec_2: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lh a6, 12(a1) ; RV32IM-NEXT: lh a3, 8(a1) @@ -363,7 +363,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_srem_vec_2: +; RV64I-LABEL: lower_srem_vec_2: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: .cfi_def_cfa_offset 64 @@ -423,7 +423,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_srem_vec_2: +; RV64IM-LABEL: lower_srem_vec_2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a6, 24(a1) ; RV64IM-NEXT: lh a7, 16(a1) @@ -477,7 +477,7 @@ } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; RV32I-LABEL: combine_srem_sdiv: ; RV32I: # %bb.0: @@ -778,9 +778,9 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { -; RV32I-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. 
+define <4 x i16> @dont_lower_srem_power_of_two(<4 x i16> %x) { +; RV32I-LABEL: dont_lower_srem_power_of_two: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -838,7 +838,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_srem_power_of_two: +; RV32IM-LABEL: dont_lower_srem_power_of_two: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lh a6, 8(a1) ; RV32IM-NEXT: lh a3, 4(a1) @@ -880,7 +880,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_srem_power_of_two: +; RV64I-LABEL: dont_lower_srem_power_of_two: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -938,7 +938,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_srem_power_of_two: +; RV64IM-LABEL: dont_lower_srem_power_of_two: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a6, 16(a1) ; RV64IM-NEXT: lh a3, 8(a1) @@ -989,9 +989,9 @@ ret <4 x i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { -; RV32I-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define <4 x i16> @dont_lower_srem_one(<4 x i16> %x) { +; RV32I-LABEL: dont_lower_srem_one: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -1039,7 +1039,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_srem_one: +; RV32IM-LABEL: dont_lower_srem_one: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lh a2, 12(a1) ; RV32IM-NEXT: lh a3, 4(a1) @@ -1081,7 +1081,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_srem_one: +; RV64I-LABEL: dont_lower_srem_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -1129,7 +1129,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_srem_one: +; RV64IM-LABEL: dont_lower_srem_one: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a2, 24(a1) ; RV64IM-NEXT: lh a3, 8(a1) @@ -1191,9 +1191,9 @@ ret <4 x i16> %1 } -; Don't fold if the divisor is 2^15. -define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; RV32I-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^15. +define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; RV32I-LABEL: dont_lower_urem_i16_smax: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -1243,7 +1243,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_i16_smax: +; RV32IM-LABEL: dont_lower_urem_i16_smax: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lh a2, 4(a1) ; RV32IM-NEXT: slli a6, a2, 16 @@ -1282,7 +1282,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_i16_smax: +; RV64I-LABEL: dont_lower_urem_i16_smax: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -1332,7 +1332,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_urem_i16_smax: +; RV64IM-LABEL: dont_lower_urem_i16_smax: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lh a2, 8(a1) ; RV64IM-NEXT: slli a6, a2, 48 @@ -1386,9 +1386,9 @@ ret <4 x i16> %1 } -; Don't fold i64 srem. -define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { -; RV32I-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem. 
+define <4 x i64> @dont_lower_srem_i64(<4 x i64> %x) { +; RV32I-LABEL: dont_lower_srem_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: .cfi_def_cfa_offset 48 @@ -1483,7 +1483,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_srem_i64: +; RV32IM-LABEL: dont_lower_srem_i64: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -48 ; RV32IM-NEXT: .cfi_def_cfa_offset 48 @@ -1578,7 +1578,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_srem_i64: +; RV64I-LABEL: dont_lower_srem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -1626,7 +1626,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_srem_i64: +; RV64IM-LABEL: dont_lower_srem_i64: ; RV64IM: # %bb.0: ; RV64IM-NEXT: ld a2, 24(a1) ; RV64IM-NEXT: ld a3, 8(a1) Index: llvm/test/CodeGen/X86/load-scalar-as-vector.ll =================================================================== --- llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -418,28 +418,28 @@ define <4 x i32> @srem_op1_constant(i32* %p) nounwind { ; SSE-LABEL: srem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movslq (%rdi), %rax -; SSE-NEXT: imulq $818089009, %rax, %rcx # imm = 0x30C30C31 -; SSE-NEXT: movq %rcx, %rdx -; SSE-NEXT: shrq $63, %rdx -; SSE-NEXT: sarq $35, %rcx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movslq (%rdi), %rcx +; SSE-NEXT: movabsq $439208192231179801, %rax # imm = 0x618618618618619 +; SSE-NEXT: imulq %rcx, %rax +; SSE-NEXT: movl $42, %edx +; SSE-NEXT: mulq %rdx +; SSE-NEXT: sarl $31, %ecx +; SSE-NEXT: andl $41, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movd %edx, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: srem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movslq (%rdi), %rax -; AVX-NEXT: imulq $818089009, %rax, %rcx # imm = 0x30C30C31 -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shrq $63, %rdx -; AVX-NEXT: sarq $35, %rcx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: movslq (%rdi), %rcx +; AVX-NEXT: movabsq $439208192231179801, %rax # imm = 0x618618618618619 +; AVX-NEXT: imulq %rcx, %rax +; AVX-NEXT: movl $42, %edx +; AVX-NEXT: mulq %rdx +; AVX-NEXT: sarl $31, %ecx +; AVX-NEXT: andl $41, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 ; AVX-NEXT: retq %x = load i32, i32* %p %b = srem i32 %x, 42 Index: llvm/test/CodeGen/X86/pr14088.ll =================================================================== --- llvm/test/CodeGen/X86/pr14088.ll +++ llvm/test/CodeGen/X86/pr14088.ll @@ -17,23 +17,24 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movslq %r8d, %rax -; CHECK-NEXT: imulq $1374389535, %rax, %rcx # imm = 0x51EB851F -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: shrq $63, %rdi -; CHECK-NEXT: sarq $37, %rcx -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: imull $100, %ecx, %ecx -; CHECK-NEXT: subl %ecx, %eax -; CHECK-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: cwtl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: movslq %r8d, %rdi +; CHECK-NEXT: movabsq $184467440737095517, %rax # imm = 0x28F5C28F5C28F5D +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: movl $100, %edx +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: andl $99, %edi +; CHECK-NEXT: subl %edi, %edx +; CHECK-NEXT: movw %dx, 
(%rsi) +; CHECK-NEXT: movswl %dx, %eax ; CHECK-NEXT: cltq ; CHECK-NEXT: imulq $1717986919, %rax, %rax # imm = 0x66666667 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $63, %rcx +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: shrq $63, %rdx ; CHECK-NEXT: shrq $34, %rax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movb %al, (%rdx) +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movb %al, (%rcx) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .LBB0_2: # %return ; CHECK-NEXT: retq Index: llvm/test/CodeGen/X86/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/srem-lkk.ll +++ llvm/test/CodeGen/X86/srem-lkk.ll @@ -1,18 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK -define i32 @fold_srem_positive_odd(i32 %x) { -; CHECK-LABEL: fold_srem_positive_odd: +define i32 @lower_srem_positive_odd(i32 %x) { +; CHECK-LABEL: lower_srem_positive_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: sarl $6, %ecx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $95, %ecx, %ecx +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %edx +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $94, %ecx ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -21,16 +20,17 @@ } -define i32 @fold_srem_positive_even(i32 %x) { -; CHECK-LABEL: fold_srem_positive_even: +define i32 @lower_srem_positive_even(i32 %x) { +; CHECK-LABEL: lower_srem_positive_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $1037275121, %rax, %rcx # imm = 0x3DD38FF1 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $63, %rdx -; CHECK-NEXT: sarq $40, %rcx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $1060, %ecx, %ecx # imm = 0x424 +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %edx # imm = 0x424 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $1059, %ecx # imm = 0x423 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -39,16 +39,17 @@ } -define i32 @fold_srem_negative_odd(i32 %x) { -; CHECK-LABEL: fold_srem_negative_odd: +define i32 @lower_srem_negative_odd(i32 %x) { +; CHECK-LABEL: lower_srem_negative_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $-1520762971, %rax, %rcx # imm = 0xA55AFFA5 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $63, %rdx -; CHECK-NEXT: sarq $40, %rcx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $-723, %ecx, %ecx # imm = 0xFD2D +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $25514168843305051, %rax # imm = 0x5AA5005AA5005B +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $723, %edx # imm = 0x2D3 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $722, %ecx # imm = 0x2D2 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: 
retq @@ -57,16 +58,17 @@ } -define i32 @fold_srem_negative_even(i32 %x) { -; CHECK-LABEL: fold_srem_negative_even: +define i32 @lower_srem_negative_even(i32 %x) { +; CHECK-LABEL: lower_srem_negative_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: imulq $-47844377, %rax, %rcx # imm = 0xFD25F3E7 -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: shrq $63, %rdx -; CHECK-NEXT: sarq $40, %rcx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: imull $-22981, %ecx, %ecx # imm = 0xA63B +; CHECK-NEXT: movslq %edi, %rcx +; CHECK-NEXT: movabsq $802695447269900, %rax # imm = 0x2DA0C18FF520C +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $22981, %edx # imm = 0x59C5 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $22980, %ecx # imm = 0x59C4 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq @@ -75,7 +77,7 @@ } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define i32 @combine_srem_sdiv(i32 %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: # %bb.0: @@ -98,9 +100,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. -define i32 @dont_fold_srem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_srem_power_of_two: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: leal 63(%rax), %ecx @@ -114,9 +116,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_srem_one(i32 %x) { -; CHECK-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_srem_one(i32 %x) { +; CHECK-LABEL: dont_lower_srem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq @@ -124,9 +126,9 @@ ret i32 %1 } -; Don't fold if the divisor is 2^31. -define i32 @dont_fold_srem_i32_smax(i32 %x) { -; CHECK-LABEL: dont_fold_srem_i32_smax: +; Don't lower if the divisor is 2^31. +define i32 @dont_lower_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_lower_srem_i32_smax: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: leal 2147483647(%rdi), %eax @@ -139,9 +141,9 @@ ret i32 %1 } -; Don't fold i64 srem -define i64 @dont_fold_srem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem +define i64 @dont_lower_srem_i64(i64 %x) { +; CHECK-LABEL: dont_lower_srem_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 ; CHECK-NEXT: movq %rdi, %rax @@ -157,3 +159,74 @@ %1 = srem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) 
local_unnamed_addr #1 + +define void @srem_loop(i32 %x) { +; CHECK-LABEL: srem_loop: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: movabsq $194176253407468965, %r14 # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: movl $95, %r15d +; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB9_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movslq %eax, %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: imulq %r14, %rax +; CHECK-NEXT: mulq %r15 +; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: andl $94, %ecx +; CHECK-NEXT: subl %ecx, %edx +; CHECK-NEXT: addl %edx, %ebp +; CHECK-NEXT: movl $.L.str, %edi +; CHECK-NEXT: movl %ebp, %esi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: callq printf +; CHECK-NEXT: cmpl %ebx, %eax +; CHECK-NEXT: jb .LBB9_1 +; CHECK-NEXT: # %bb.2: # %afterloop +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = srem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) 
@printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/X86/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -3,141 +3,118 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 -define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { -; SSE-LABEL: fold_srem_vec_1: +define <4 x i16> @lower_srem_vec_1(<4 x i16> %x) { +; SSE-LABEL: lower_srem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: subl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: movswl %dx, %esi -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi -; SSE-NEXT: addl %edx, %esi -; SSE-NEXT: imull $95, %esi, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $-124, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $98, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [95,124,98,1003] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_1: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: subl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: 
movswl %cx, %edx -; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: movswl %dx, %esi -; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: imull $95, %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $-124, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $98, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: lower_srem_vec_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: lower_srem_vec_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { -; SSE-LABEL: fold_srem_vec_2: +define <4 x i16> @lower_srem_vec_2(<4 x i16> %x) { +; SSE-LABEL: lower_srem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrlw $15, %xmm2 -; SSE-NEXT: psraw $6, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [95,95,95,95] +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE-NEXT: 
pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: lower_srem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: lower_srem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [95,95,95,95] +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if we can combine srem with sdiv. +; Don't lower if we can combine srem with sdiv. define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; SSE-LABEL: combine_srem_sdiv: ; SSE: # %bb.0: @@ -171,230 +148,157 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { -; SSE-LABEL: dont_fold_srem_power_of_two: +; Don't lower for divisors that are a power of two. 
+define <4 x i16> @dont_lower_srem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_lower_srem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 31(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: leal 63(%rcx), %edx -; SSE-NEXT: testw %cx, %cx -; SSE-NEXT: cmovnsl %ecx, %edx -; SSE-NEXT: andl $-64, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: leal 7(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-8, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $95, %edx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,32,8,95] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: dont_fold_srem_power_of_two: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 31(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: leal 63(%rcx), %edx -; AVX-NEXT: testw %cx, %cx -; AVX-NEXT: cmovnsl %ecx, %edx -; AVX-NEXT: andl $-64, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: leal 7(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-8, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $95, %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: dont_lower_srem_power_of_two: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,32,8,95] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_lower_srem_power_of_two: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [64,32,8,95] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { -; SSE-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define <4 x i16> @dont_lower_srem_one(<4 x i16> %x) { +; SSE-LABEL: dont_lower_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,654,23,5423] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: dont_fold_srem_one: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl 
%ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx -; AVX-NEXT: addl %esi, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: dont_lower_srem_one: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,654,23,5423] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_lower_srem_one: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,654,23,5423] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovsxwd %xmm0, %xmm3 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is 2^15. -define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; SSE-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^15. 
+define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; SSE-LABEL: dont_lower_urem_i16_smax: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax ; SSE-NEXT: leal 32767(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx ; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 ; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; SSE-NEXT: leaq (%rcx,%rcx,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rcx, %rdx +; SSE-NEXT: shrq $32, %rdx +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $22, %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: pinsrw $2, %edx, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax +; SSE-NEXT: cwtl +; SSE-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; SSE-NEXT: shrl $15, %eax +; SSE-NEXT: andl $5422, %eax # imm = 0x152E +; SSE-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: pinsrw $3, %ecx, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: dont_fold_urem_i16_smax: +; AVX-LABEL: dont_lower_urem_i16_smax: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx ; AVX-NEXT: vpextrw $1, %xmm0, %eax ; AVX-NEXT: leal 32767(%rax), %ecx ; AVX-NEXT: testw %ax, %ax @@ -403,25 +307,34 @@ ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: cwtl +; AVX-NEXT: imull $186737709, %eax, %ecx # imm = 0xB21642D +; AVX-NEXT: leaq (%rcx,%rcx,2), %rdx +; AVX-NEXT: shlq $3, %rdx +; AVX-NEXT: subq %rcx, %rdx +; AVX-NEXT: shrq $32, %rdx +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $22, %eax +; AVX-NEXT: subl %eax, %edx ; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; 
AVX-NEXT: cwtl +; AVX-NEXT: imull $791992, %eax, %ecx # imm = 0xC15B8 +; AVX-NEXT: shrl $15, %eax +; AVX-NEXT: andl $5422, %eax # imm = 0x152E +; AVX-NEXT: imulq $5423, %rcx, %rcx # imm = 0x152F +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold i64 srem. -define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { -; SSE-LABEL: dont_fold_srem_i64: +; Don't lower i64 srem. +define <4 x i64> @dont_lower_srem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_lower_srem_i64: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movq %xmm1, %rcx @@ -464,7 +377,7 @@ ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: dont_fold_srem_i64: +; AVX1-LABEL: dont_lower_srem_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rcx @@ -508,7 +421,7 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: dont_fold_srem_i64: +; AVX2-LABEL: dont_lower_srem_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rcx Index: llvm/test/CodeGen/X86/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -375,4 +375,4 @@ ; AVX2-NEXT: retq %1 = urem <4 x i64> %x, ret <4 x i64> %1 -} \ No newline at end of file +} Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -625,15 +625,70 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm5, %xmm5 +; AVX1-NEXT: vpmulld %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX2NOBW-NEXT: 
vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2NOBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = srem <8 x i16> %a, ret <8 x i16> %res } @@ -699,71 +754,49 @@ ; AVX1-LABEL: test_rem7_16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: 
vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1 ; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = srem <16 x i8> %a, @@ -868,96 +901,47 @@ ; ; AVX1-LABEL: test_remconstant_16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 -; AVX1-NEXT: vpmullw 
{{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_remconstant_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm3 -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2NOBW-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm3 -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2NOBW-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1 ; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, 
%xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] -; AVX512BW-NEXT: vpsravw %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -553,15 +553,31 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2 -; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $15, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsraw $1, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm1 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 +; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpsubw %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: retq %res = srem <16 x i16> %a, ret <16 x i16> %res } @@ -647,23 +663,14 @@ ; ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = srem <32 x i8> %a, ret <32 x i8> %res @@ -798,23 +805,13 @@ ; ; AVX512BW-LABEL: test_remconstant_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 -; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm1 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; 
AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = srem <32 x i8> %a, Index: llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -442,20 +442,30 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] -; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 -; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3 -; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm5 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm5 +; AVX512F-NEXT: vpmovdw %zmm5, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 +; AVX512F-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm3 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm2 +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/vector-intrinsics.ll +++ llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | grep call | count 43 declare <4 x double> @llvm.sin.v4f64(<4 x double> %p) Index: llvm/test/CodeGen/X86/vector-rem.ll =================================================================== --- llvm/test/CodeGen/X86/vector-rem.ll +++ llvm/test/CodeGen/X86/vector-rem.ll @@ -81,30 +81,30 @@ ; CHECK-LABEL: qux: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: movaps 
%xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq fmodf -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] ; CHECK-NEXT: callq fmodf -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] Index: llvm/test/CodeGen/X86/vector-truncate-combine.ll =================================================================== --- llvm/test/CodeGen/X86/vector-truncate-combine.ll +++ llvm/test/CodeGen/X86/vector-truncate-combine.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-- -O2 -start-after=stack-protector -stop-before=loops %s -o - | FileCheck %s ; This test verifies the fix for PR33368. 
Index: llvm/test/CodeGen/X86/vector-variable-idx.ll =================================================================== --- llvm/test/CodeGen/X86/vector-variable-idx.ll +++ llvm/test/CodeGen/X86/vector-variable-idx.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | grep movss | count 2 ; PR2676 Index: llvm/test/CodeGen/X86/vector-variable-idx2.ll =================================================================== --- llvm/test/CodeGen/X86/vector-variable-idx2.ll +++ llvm/test/CodeGen/X86/vector-variable-idx2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mattr=+sse4.1 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" Index: llvm/test/CodeGen/X86/vector-width-store-merge.ll =================================================================== --- llvm/test/CodeGen/X86/vector-width-store-merge.ll +++ llvm/test/CodeGen/X86/vector-width-store-merge.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s ; This tests whether or not we generate vectors large than preferred vector width when @@ -5,40 +6,60 @@ ; Function Attrs: nounwind uwtable define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 { +; CHECK-LABEL: A: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups 16(%rdi), %xmm1 +; CHECK-NEXT: vmovups %xmm1, 16(%rsi) +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: retq entry: -; CHECK: A -; CHECK-NOT: vmovups %ymm -; CHECK: vmovups %xmm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) ret void } ; Function Attrs: nounwind uwtable define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 { +; CHECK-LABEL: B: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups 16(%rdi), %xmm1 +; CHECK-NEXT: vmovups 32(%rdi), %xmm2 +; CHECK-NEXT: vmovups 48(%rdi), %xmm3 +; CHECK-NEXT: vmovups %xmm3, 48(%rsi) +; CHECK-NEXT: vmovups %xmm2, 32(%rsi) +; CHECK-NEXT: vmovups %xmm1, 16(%rsi) +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: retq entry: -; CHECK: B -; CHECK-NOT: vmovups %zmm -; CHECK: vmovups %xmm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) ret void } ; Function Attrs: nounwind uwtable define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 { +; CHECK-LABEL: C: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: -; CHECK: C -; CHECK-NOT: vmovups %ymm -; CHECK: vmovups %ymm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) ret void } ; Function Attrs: nounwind uwtable define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 { +; CHECK-LABEL: D: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups 32(%rdi), %ymm1 +; CHECK-NEXT: vmovups %ymm1, 32(%rsi) +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: -; CHECK: D -; CHECK-NOT: vmovups %zmm -; CHECK: vmovups %ymm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) ret void }
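
For reference while reading the regenerated checks: a minimal scalar sketch (not part of the patch) of what the new srem-by-constant lowering computes, following the LKK "Faster Remainder by Direct Computation" formula used by BuildSREM. With F = 2*N fractional bits and c = floor((2^F - 1)/|d|) + 1, the signed remainder is mulhu(c*n mod 2^F, |d|) minus (|d| - 1) when n is negative. The function name srem7_lkk and the exhaustive self-test are illustrative assumptions, not code from this change.

    #include <cassert>
    #include <cstdint>

    // Scalar model of the i8 srem-by-7 lowering exercised by test_rem7_16i8:
    // N = 8, F = 16, |d| = 7, magic c = floor((2^16 - 1) / 7) + 1 = 9363.
    static int8_t srem7_lkk(int8_t n) {
      const uint16_t pd = 7;                // absolute divisor
      const uint16_t c = 0xFFFFu / pd + 1;  // approximate reciprocal (magic)
      // lowbits = c * n mod 2^F  (vpmullw on the sign-extended input)
      uint16_t lowbits = (uint16_t)(c * (int16_t)n);
      // highbits = (lowbits * pd) >> F  (vpmulhuw by the absolute divisor)
      uint16_t highbits = (uint16_t)(((uint32_t)lowbits * pd) >> 16);
      // Correction for truncated (C-style) signed remainder: subtract
      // |d| - 1 when n < 0  (the vpcmpgtb + vpand + vpsubb sequence)
      int correction = (n < 0) ? (int)(pd - 1) : 0;
      return (int8_t)((int)highbits - correction);
    }

    int main() {
      // Exhaustively compare against the native signed remainder.
      for (int n = -128; n < 128; ++n)
        assert(srem7_lkk((int8_t)n) == (int8_t)(n % 7));
      return 0;
    }

The same shape appears in the i16 tests, where the 32-bit magic 613566757 = floor((2^32 - 1)/7) + 1 feeds the vpmulld/vpmuludq pair and the vpsraw $15 plus vpand-by-[6,6,...] sequence forms the negative-input correction.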