Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3972,6 +3972,10 @@
   /// power-of-2 denominators. If the target returns an empty SDValue, LLVM
   /// assumes SDIV is expensive and replaces it with a series of other integer
   /// operations.
+
+  SDValue BuildUREM(SDNode *Node, SelectionDAG &DAG, bool IsAfterLegalization,
+                    SmallVectorImpl<SDNode *> &Created) const;
+
   virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const;
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3916,6 +3916,25 @@
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr) &&
+      isConstantOrConstantVector(N1)) {
+    // Check whether there is a matching div to combine with this rem.
+    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+    SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), {N0, N1});
+    if (!DivNode) {
+      SmallVector<SDNode *, 8> Built;
+      SDValue OptimizedRem =
+          isSigned ? SDValue() /* placeholder for srem */
+                   : TLI.BuildUREM(N, DAG, LegalOperations, Built);
+      if (OptimizedRem.getNode()) {
+        for (SDNode *N : Built) {
+          AddToWorklist(N);
+        }
+        return OptimizedRem;
+      }
+    }
+  }
+
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4903,6 +4903,116 @@
   std::replace_if(Values.begin(), Values.end(), Predicate, Replacement);
 }
+/// Given an ISD::UREM where the divisor is constant,
+/// return a DAG expression that will generate the same result
+/// using only multiplications, additions and shifts.
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK).
+SDValue TargetLowering::BuildUREM(SDNode *Node, SelectionDAG &DAG,
+                                  bool IsAfterLegalization,
+                                  SmallVectorImpl<SDNode *> &Created) const {
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+  EVT FVT;
+  if (VT.isVector()) {
+    EVT SVT =
+        EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+    FVT = EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorElementCount());
+  } else {
+    FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  }
+
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // When optimizing for minimum size, don't expand the urem.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  // Check to see if we can do this.
+  if (IsAfterLegalization && !isTypeLegal(FVT))
+    return SDValue();
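+
+  // The expansion below works entirely in the double-width type FVT
+  // (F = 2 * N bits for an N-bit urem):
+  //   lowbits = (c * n) mod 2^F   with c = floor((2^F - 1) / d) + 1
+  //   n % d   = (lowbits * d) >> F
+  // so besides the zero-extensions it only needs a MUL plus a high-half
+  // multiply (MULHU or UMUL_LOHI) on FVT.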
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (!isOperationLegalOrCustom(ISD::MUL, FVT))
+    return SDValue();
+
+  SmallVector<SDValue, 16> MagicFactors;
+  bool AllDivisorsArePowerOfTwo = true;
+  bool AllDivisorsAreOnes = true;
+
+  auto BuildUREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    // Calculate the magic constant c = floor((2^F - 1) / d) + 1, i.e.
+    // ceil(2^F / d) for a non-power-of-two d.
+    const APInt &D = DivisorConstant->getAPIntValue();
+    assert(!D.isNullValue() && "Divisor cannot be zero");
+    APInt C = APInt::getMaxValue(F).udiv(D.zext(F)) + APInt(F, 1);
+    SDValue ApproximateReciprocal = DAG.getConstant(C, DL, FVT.getScalarType());
+
+    MagicFactors.push_back(ApproximateReciprocal);
+
+    AllDivisorsArePowerOfTwo &= D.isPowerOf2();
+    AllDivisorsAreOnes &= D.isOneValue();
+
+    if (!D.isStrictlyPositive()) {
+      // Divisor must be in the range [1, 2^N).
+      return false;
+    }
+
+    return true;
+  };
+
+  // Numerator, zero-extended to the double-width type.
+  SDValue Numerator = Node->getOperand(0);
+  SDValue ExtendedNumerator = DAG.getZExtOrTrunc(Numerator, DL, FVT);
+
+  // Constant divisor, zero-extended to the double-width type.
+  SDValue Divisor = Node->getOperand(1);
+  SDValue ExtendedDivisor = DAG.getZExtOrTrunc(Divisor, DL, FVT);
+
+  if (!ISD::matchUnaryPredicate(Divisor, BuildUREMPattern))
+    return SDValue();
+
+  // If this is a urem by one, avoid the fold since it can be constant-folded.
+  if (AllDivisorsAreOnes)
+    return SDValue();
+
+  // If this is a urem by a power of two, avoid the fold since it is best
+  // implemented as a mask.
+  if (AllDivisorsArePowerOfTwo)
+    return SDValue();
+
+  SDValue MagicFactor = VT.isVector()
+                            ? DAG.getBuildVector(FVT, DL, MagicFactors)
+                            : MagicFactors[0];
+
+  // lowbits = (c * n) mod 2^F
+  SDValue Lowbits =
+      DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator);
+
+  // result = (lowbits * d) >> F, taken from the high half of the multiply.
+  SDValue Result;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, FVT)
+                          : isOperationLegalOrCustom(ISD::MULHU, FVT))
+    Result = DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedDivisor);
+  else if (IsAfterLegalization
+               ? 
isOperationLegal(ISD::UMUL_LOHI, FVT) + : isOperationLegalOrCustom(ISD::UMUL_LOHI, FVT)) { + SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT), + Lowbits, ExtendedDivisor); + Result = SDValue(LoHi.getNode(), 1); + } else { + return SDValue(); // No mulhu or equivalent + } + + Created.push_back(MagicFactor.getNode()); + Created.push_back(ExtendedNumerator.getNode()); + Created.push_back(Lowbits.getNode()); + Created.push_back(ExtendedDivisor.getNode()); + Created.push_back(Result.getNode()); + + return DAG.getZExtOrTrunc(Result, DL, VT); +} + /// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE /// where the divisor is constant and the comparison target is zero, /// return a DAG expression that will generate the same comparison result Index: llvm/test/CodeGen/AArch64/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-lkk.ll +++ llvm/test/CodeGen/AArch64/urem-lkk.ll @@ -1,40 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -define i32 @fold_urem_positive_odd(i32 %x) { -; CHECK-LABEL: fold_urem_positive_odd: +define i32 @lower_urem_positive_odd(i32 %x) { +; CHECK-LABEL: lower_urem_positive_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8969 -; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: umull x8, w0, w8 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: sub w9, w0, w8 -; CHECK-NEXT: add w8, w8, w9, lsr #1 -; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov x9, #7589 +; CHECK-NEXT: movk x9, #4139, lsl #16 +; CHECK-NEXT: movk x9, #55878, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #689, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 ; CHECK-NEXT: mov w9, #95 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %1 = urem i32 %x, 95 ret i32 %1 } -define i32 @fold_urem_positive_even(i32 %x) { -; CHECK-LABEL: fold_urem_positive_even: +define i32 @lower_urem_positive_even(i32 %x) { +; CHECK-LABEL: lower_urem_positive_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16323 -; CHECK-NEXT: movk w8, #63310, lsl #16 -; CHECK-NEXT: umull x8, w0, w8 -; CHECK-NEXT: lsr x8, x8, #42 +; CHECK-NEXT: mov x9, #7172 +; CHECK-NEXT: movk x9, #61579, lsl #16 +; CHECK-NEXT: movk x9, #54159, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #61, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 ; CHECK-NEXT: mov w9, #1060 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define i32 @combine_urem_udiv(i32 %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: @@ -55,9 +58,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. -define i32 @dont_fold_urem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_urem_power_of_two: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x3f ; CHECK-NEXT: ret @@ -65,9 +68,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_urem_one(i32 %x) { -; CHECK-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. 
+define i32 @dont_lower_urem_one(i32 %x) { +; CHECK-LABEL: dont_lower_urem_one: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret @@ -75,18 +78,18 @@ ret i32 %1 } -; Don't fold if the divisor is 2^32. -define i32 @dont_fold_urem_i32_umax(i32 %x) { -; CHECK-LABEL: dont_fold_urem_i32_umax: +; Don't lower if the divisor is 2^32. +define i32 @dont_lower_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_lower_urem_i32_umax: ; CHECK: // %bb.0: ; CHECK-NEXT: ret %1 = urem i32 %x, 4294967296 ret i32 %1 } -; Don't fold i64 urem -define i64 @dont_fold_urem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem +define i64 @dont_lower_urem_i64(i64 %x) { +; CHECK-LABEL: dont_lower_urem_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x9, #58849 ; CHECK-NEXT: movk x9, #48148, lsl #16 @@ -101,3 +104,62 @@ %1 = urem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 + +define void @urem_loop(i32 %x) { +; CHECK-LABEL: urem_loop: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: mov x22, #7589 +; CHECK-NEXT: movk x22, #4139, lsl #16 +; CHECK-NEXT: movk x22, #55878, lsl #32 +; CHECK-NEXT: adrp x21, .L.str +; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: movk x22, #689, lsl #48 +; CHECK-NEXT: mov w23, #95 +; CHECK-NEXT: add x21, x21, :lo12:.L.str +; CHECK-NEXT: .LBB7_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mul x8, x8, x22 +; CHECK-NEXT: umulh x8, x8, x23 +; CHECK-NEXT: add w20, w8, w20 +; CHECK-NEXT: mov x0, x21 +; CHECK-NEXT: mov w1, w20 +; CHECK-NEXT: bl printf +; CHECK-NEXT: cmp w0, w19 +; CHECK-NEXT: b.lo .LBB7_1 +; CHECK-NEXT: // %bb.2: // %afterloop +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = urem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) 
@printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/AArch64/urem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-seteq.ll +++ llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -78,15 +78,15 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 +; CHECK-NEXT: mov w9, #9363 ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #46811, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: mov w9, #14 +; CHECK-NEXT: umull x8, w8, w9 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i16 %X, 14 %cmp = icmp ne i16 %urem, 0 Index: llvm/test/CodeGen/AArch64/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -1,99 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s -define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { -; CHECK-LABEL: fold_urem_vec_1: +define <4 x i16> @lower_urem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: lower_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w11, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: movk w11, #21399, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #16913 -; CHECK-NEXT: mov w12, #98 -; CHECK-NEXT: lsr x11, x11, #37 -; CHECK-NEXT: movk w9, #8456, lsl #16 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: ubfx w12, w8, #2, #14 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: mov w11, #124 -; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: msub w8, w9, w11, w8 -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: umov w12, v0.h[0] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w12, w9 -; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: mov w11, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w9, w9, w11, w12 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #2287 -; CHECK-NEXT: movk w9, #16727, lsl #16 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #1003 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w9, w8, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { -; CHECK-LABEL: fold_urem_vec_2: +define <4 x i16> @lower_urem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: 
lower_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w8, w9 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w8, w13 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w9 -; CHECK-NEXT: add w9, w9, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: lsr w13, w14, #6 -; CHECK-NEXT: msub w10, w13, w16, w10 -; CHECK-NEXT: lsr w13, w15, #6 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w13, w16, w11 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: movi v1.4s, #95 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: @@ -146,86 +91,56 @@ } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define <4 x i16> @dont_lower_urem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w10, w8, w9 -; CHECK-NEXT: add w9, w9, w10, lsr #1 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w10, #0x1f -; CHECK-NEXT: and w9, w9, #0x7 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is one. 
-define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_srem_one: +; Don't lower if the divisor is one. +define <4 x i16> @dont_lower_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: lsr x9, x9, #36 -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: mov w9, #30865 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: ubfx w10, w11, #1, #15 -; CHECK-NEXT: umull x9, w10, w9 -; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: msub w9, w9, w10, w11 -; CHECK-NEXT: mov w11, #47143 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: lsr x11, x11, #43 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: msub w8, w11, w9, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: adrp x9, .LCPI4_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is 2^16. -define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^16. +define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_urem_i16_smax: ; CHECK: // %bb.0: ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold i64 urem. -define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { -; CHECK-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem. 
+define <4 x i64> @dont_lower_urem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_lower_urem_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x10, #12109 ; CHECK-NEXT: movk x10, #52170, lsl #16 Index: llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll =================================================================== --- llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s ; test_no_prep: @@ -19,8 +20,21 @@ define i64 @test_no_prep(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_no_prep: -; CHECK: addi r3, r3, 4004 -; CHECK: .LBB0_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB0_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r5, r4, r5, gt +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: addi r3, r3, 4004 +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: li r6, -3 +; CHECK-NEXT: li r7, -2 +; CHECK-NEXT: li r8, -1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # ; CHECK-NEXT: ldx r9, r3, r6 ; CHECK-NEXT: ldx r10, r3, r7 ; CHECK-NEXT: mulld r9, r10, r9 @@ -30,6 +44,12 @@ ; CHECK-NEXT: addi r3, r3, 1 ; CHECK-NEXT: maddld r5, r9, r12, r5 ; CHECK-NEXT: bdnz .LBB0_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r5, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %27, label %5 @@ -83,8 +103,20 @@ define i64 @test_ds_prep(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_prep: -; CHECK: addi r6, r3, 4001 -; CHECK: .LBB1_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB1_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: addi r6, r3, 4001 +; CHECK-NEXT: isel r3, r4, r5, gt +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: li r7, 2 +; CHECK-NEXT: li r8, 5 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_2: # ; CHECK-NEXT: ld r10, 0(r6) ; CHECK-NEXT: ldx r11, r6, r5 ; CHECK-NEXT: mulld r10, r11, r10 @@ -95,6 +127,12 @@ ; CHECK-NEXT: maddld r3, r10, r6, r3 ; CHECK-NEXT: mr r6, r9 ; CHECK-NEXT: bdnz .LBB1_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %27, label %5 @@ -158,8 +196,32 @@ define i64 @test_max_number_reminder(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_max_number_reminder: -; CHECK: addi r8, r3, 4001 -; CHECK: .LBB2_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: beq cr0, .LBB2_3 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: addi r8, r3, 4001 +; CHECK-NEXT: isel r3, r4, r5, gt +; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: li r6, 2 +; CHECK-NEXT: li r7, 5 +; 
CHECK-NEXT: li r9, 6 +; CHECK-NEXT: li r10, 13 +; CHECK-NEXT: li r11, 9 +; CHECK-NEXT: li r12, 10 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB2_2: # ; CHECK-NEXT: ld r30, 0(r8) ; CHECK-NEXT: ldx r29, r8, r5 ; CHECK-NEXT: mulld r30, r29, r30 @@ -180,6 +242,20 @@ ; CHECK-NEXT: maddld r3, r8, r23, r3 ; CHECK-NEXT: mr r8, r0 ; CHECK-NEXT: bdnz .LBB2_2 +; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r3, r3, r4 +; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %47, label %5 @@ -253,8 +329,20 @@ define dso_local i64 @test_update_ds_prep_interact(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_update_ds_prep_interact: -; CHECK: addi r3, r3, 3997 -; CHECK: .LBB3_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB3_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: isel r5, r4, r6, gt +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: addi r3, r3, 3997 +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: li r7, 2 +; CHECK-NEXT: li r8, 5 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB3_2: # ; CHECK-NEXT: ldu r9, 4(r3) ; CHECK-NEXT: ldx r10, r3, r6 ; CHECK-NEXT: mulld r9, r10, r9 @@ -263,6 +351,12 @@ ; CHECK-NEXT: ldx r12, r3, r8 ; CHECK-NEXT: maddld r5, r9, r12, r5 ; CHECK-NEXT: bdnz .LBB3_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r5, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %28, label %5 @@ -317,8 +411,20 @@ define i64 @test_update_ds_prep_nointeract(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_update_ds_prep_nointeract: -; CHECK: addi r3, r3, 4000 -; CHECK: .LBB4_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: beq cr0, .LBB4_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: isel r5, r4, r6, gt +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: addi r3, r3, 4000 +; CHECK-NEXT: li r5, 0 +; CHECK-NEXT: li r7, 2 +; CHECK-NEXT: li r8, 6 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB4_2: # ; CHECK-NEXT: lbzu r9, 1(r3) ; CHECK-NEXT: ldx r10, r3, r6 ; CHECK-NEXT: mulld r9, r10, r9 @@ -327,6 +433,12 @@ ; CHECK-NEXT: ldx r12, r3, r8 ; CHECK-NEXT: maddld r5, r9, r12, r5 ; CHECK-NEXT: bdnz .LBB4_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: add r3, r5, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: addi r3, r4, 0 +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %27, label %5 @@ -384,9 +496,25 @@ define dso_local i64 @test_ds_multiple_chains(i8* %0, i8* %1, i32 signext %2) { ; CHECK-LABEL: test_ds_multiple_chains: -; CHECK: addi r3, r3, 4010 -; CHECK: addi r4, r4, 4010 -; CHECK: .LBB5_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r5, 0 +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: beq cr0, .LBB5_3 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: cmpldi r5, 1 +; CHECK-NEXT: li 
r6, 1 +; CHECK-NEXT: isel r6, r5, r6, gt +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: addi r3, r3, 4010 +; CHECK-NEXT: addi r4, r4, 4010 +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: li r7, -9 +; CHECK-NEXT: li r8, -5 +; CHECK-NEXT: li r9, -1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB5_2: # ; CHECK-NEXT: ldx r10, r3, r7 ; CHECK-NEXT: ld r11, 0(r3) ; CHECK-NEXT: mulld r10, r11, r10 @@ -405,6 +533,15 @@ ; CHECK-NEXT: addi r4, r4, 1 ; CHECK-NEXT: maddld r6, r10, r28, r6 ; CHECK-NEXT: bdnz .LBB5_2 +; CHECK-NEXT: b .LBB5_4 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r3, r6, r5 +; CHECK-NEXT: blr %4 = sext i32 %2 to i64 %5 = icmp eq i32 %2, 0 br i1 %5, label %45, label %6 @@ -491,11 +628,46 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_cross_basic_blocks: -; CHECK: addi r5, r3, 4000 -; CHECK: .LBB6_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmplwi r4, 0 +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: beq cr0, .LBB6_8 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addis r6, r2, .LC0@toc@ha +; CHECK-NEXT: ld r7, .LC0@toc@l(r6) +; CHECK-NEXT: cmpldi r4, 1 +; CHECK-NEXT: li r6, 1 +; CHECK-NEXT: isel r10, r4, r6, gt +; CHECK-NEXT: ld r4, 0(r7) +; CHECK-NEXT: lis r7, 21845 +; CHECK-NEXT: ori r7, r7, 21845 +; CHECK-NEXT: sldi r7, r7, 32 +; CHECK-NEXT: oris r7, r7, 21845 +; CHECK-NEXT: mtctr r10 +; CHECK-NEXT: addi r5, r3, 4000 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: addi r4, r4, -1 +; CHECK-NEXT: ori r7, r7, 21846 +; CHECK-NEXT: li r8, 3 +; CHECK-NEXT: li r9, 2 +; CHECK-NEXT: li r10, 5 +; CHECK-NEXT: li r11, 9 +; CHECK-NEXT: li r12, 1 +; CHECK-NEXT: li r30, 1 +; CHECK-NEXT: li r29, 1 +; CHECK-NEXT: li r28, 1 +; CHECK-NEXT: li r27, 1 +; CHECK-NEXT: li r26, 1 +; CHECK-NEXT: b .LBB6_4 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB6_2: # ; CHECK-NEXT: ld r0, 0(r5) ; CHECK-NEXT: add r26, r0, r26 -; CHECK-NEXT: ldx r0, r5, r7 +; CHECK-NEXT: ldx r0, r5, r6 ; CHECK-NEXT: add r27, r0, r27 ; CHECK-NEXT: .LBB6_3: # ; CHECK-NEXT: mulld r0, r27, r26 @@ -506,31 +678,37 @@ ; CHECK-NEXT: addi r5, r5, 1 ; CHECK-NEXT: bdz .LBB6_9 ; CHECK-NEXT: .LBB6_4: # -; CHECK-NEXT: lbzu r0, 1(r6) -; CHECK-NEXT: clrldi r25, r0, 32 -; CHECK-NEXT: mulld r25, r25, r4 -; CHECK-NEXT: rldicl r25, r25, 31, 33 -; CHECK-NEXT: slwi r24, r25, 1 -; CHECK-NEXT: add r25, r25, r24 -; CHECK-NEXT: subf r0, r25, r0 +; CHECK-NEXT: lbzu r0, 1(r4) +; CHECK-NEXT: mulld r0, r0, r7 +; CHECK-NEXT: mulhdu r0, r0, r8 ; CHECK-NEXT: cmplwi r0, 1 ; CHECK-NEXT: beq cr0, .LBB6_2 ; CHECK-NEXT: # %bb.5: # -; CHECK-NEXT: clrlwi r0, r0, 24 +; CHECK-NEXT: rlwinm r0, r0, 0, 24, 31 ; CHECK-NEXT: cmplwi r0, 2 ; CHECK-NEXT: bne cr0, .LBB6_7 ; CHECK-NEXT: # %bb.6: # -; CHECK-NEXT: ldx r0, r5, r8 -; CHECK-NEXT: add r28, r0, r28 ; CHECK-NEXT: ldx r0, r5, r9 +; CHECK-NEXT: add r28, r0, r28 +; CHECK-NEXT: ldx r0, r5, r10 ; CHECK-NEXT: add r29, r0, r29 ; CHECK-NEXT: b .LBB6_3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_7: # -; CHECK-NEXT: ldx r0, r5, r10 +; CHECK-NEXT: ldx r0, r5, r8 ; CHECK-NEXT: add r30, r0, r30 ; CHECK-NEXT: ldx r0, r5, r11 ; CHECK-NEXT: add r12, r0, r12 +; CHECK-NEXT: b .LBB6_3 +; CHECK-NEXT: 
.LBB6_8: +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: .LBB6_9: +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; CHECK-NEXT: blr %3 = sext i32 %1 to i64 %4 = icmp eq i32 %1, 0 br i1 %4, label %66, label %5 @@ -635,8 +813,19 @@ define float @test_ds_float(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_float: -; CHECK: addi r3, r3, 4000 -; CHECK: .LBB7_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: blt cr0, .LBB7_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: addi r3, r3, 4000 +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: li r4, 1 +; CHECK-NEXT: li r5, 21 +; CHECK-NEXT: li r6, 61 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB7_2: # ; CHECK-NEXT: lfsu f0, 1(r3) ; CHECK-NEXT: lfsx f2, r3, r4 ; CHECK-NEXT: lfsx f3, r3, r5 @@ -646,6 +835,11 @@ ; CHECK-NEXT: xsmulsp f0, f0, f4 ; CHECK-NEXT: xsaddsp f1, f1, f0 ; CHECK-NEXT: bdnz .LBB7_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: blr %3 = icmp sgt i32 %1, 0 br i1 %3, label %4, label %28 @@ -702,9 +896,19 @@ define float @test_ds_combine_float_int(i8* %0, i32 signext %1) { ; CHECK-LABEL: test_ds_combine_float_int: -; CHECK: addi r4, r3, 4001 -; CHECK: addi r3, r3, 4000 -; CHECK: .LBB8_2: # +; CHECK: # %bb.0: +; CHECK-NEXT: cmpwi r4, 1 +; CHECK-NEXT: blt cr0, .LBB8_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: clrldi r5, r4, 32 +; CHECK-NEXT: addi r4, r3, 4001 +; CHECK-NEXT: addi r3, r3, 4000 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: li r5, 21 +; CHECK-NEXT: li r6, 61 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB8_2: # ; CHECK-NEXT: lfdu f4, 1(r4) ; CHECK-NEXT: lfsu f0, 1(r3) ; CHECK-NEXT: xscvuxdsp f4, f4 @@ -715,6 +919,11 @@ ; CHECK-NEXT: xsmulsp f0, f3, f0 ; CHECK-NEXT: xsaddsp f1, f1, f0 ; CHECK-NEXT: bdnz .LBB8_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB8_4: +; CHECK-NEXT: xxlxor f1, f1, f1 +; CHECK-NEXT: blr %3 = icmp sgt i32 %1, 0 br i1 %3, label %4, label %29 Index: llvm/test/CodeGen/PowerPC/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/urem-lkk.ll +++ llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -2,8 +2,8 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck -check-prefixes=CHECK,CHECK64 %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefixes=CHECK,CHECK32 %s -define i32 @fold_urem_positive_odd(i32 %x) { -; CHECK-LABEL: fold_urem_positive_odd: +define i32 @lower_urem_positive_odd(i32 %x) { +; CHECK-LABEL: lower_urem_positive_odd: ; CHECK: # %bb.0: ; CHECK-NEXT: lis 4, 22765 ; CHECK-NEXT: ori 4, 4, 8969 @@ -20,8 +20,8 @@ } -define i32 @fold_urem_positive_even(i32 %x) { -; CHECK-LABEL: fold_urem_positive_even: +define i32 @lower_urem_positive_even(i32 %x) { +; CHECK-LABEL: lower_urem_positive_even: ; CHECK: # %bb.0: ; CHECK-NEXT: lis 4, -2226 ; CHECK-NEXT: ori 4, 4, 16323 @@ -35,7 +35,7 @@ } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define i32 @combine_urem_udiv(i32 %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: # %bb.0: @@ -56,9 +56,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. 
-define i32 @dont_fold_urem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_urem_power_of_two: ; CHECK: # %bb.0: ; CHECK-NEXT: clrlwi 3, 3, 26 ; CHECK-NEXT: blr @@ -66,9 +66,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_urem_one(i32 %x) { -; CHECK-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_urem_one(i32 %x) { +; CHECK-LABEL: dont_lower_urem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr @@ -76,18 +76,18 @@ ret i32 %1 } -; Don't fold if the divisor is 2^32. -define i32 @dont_fold_urem_i32_umax(i32 %x) { -; CHECK-LABEL: dont_fold_urem_i32_umax: +; Don't lower if the divisor is 2^32. +define i32 @dont_lower_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_lower_urem_i32_umax: ; CHECK: # %bb.0: ; CHECK-NEXT: blr %1 = urem i32 %x, 4294967296 ret i32 %1 } -; Don't fold i64 urem -define i64 @dont_fold_urem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem +define i64 @dont_lower_urem_i64(i64 %x) { +; CHECK-LABEL: dont_lower_urem_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: mflr 0 ; CHECK-NEXT: stw 0, 4(1) @@ -104,3 +104,24 @@ %1 = urem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 + +define void @urem_loop(i32 %x) { +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = urem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -8,293 +8,275 @@ ; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ ; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8BE -define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { -; P9LE-LABEL: fold_urem_vec_1: +define <4 x i16> @lower_urem_vec_1(<4 x i16> %x) { +; P9LE-LABEL: lower_urem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 +; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r5, 21399 -; P9LE-NEXT: ori r5, r5, 33437 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: lis r5, 16727 -; P9LE-NEXT: ori r5, r5, 2287 -; P9LE-NEXT: rldicl r4, r4, 27, 37 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 95 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 528 +; P9LE-NEXT: ori r4, r4, 33825 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: oris r4, r4, 2114 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: lis r5, 8456 -; P9LE-NEXT: ori r5, r5, 16913 -; 
P9LE-NEXT: rldicl r4, r4, 24, 40 -; P9LE-NEXT: mulli r4, r4, 1003 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 4229 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 124 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 668 +; P9LE-NEXT: ori r4, r4, 48148 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: oris r4, r4, 58848 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 30, 34 -; P9LE-NEXT: mulli r4, r4, 124 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 42800 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 98 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 65 +; P9LE-NEXT: ori r4, r4, 22280 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 61158 +; P9LE-NEXT: ori r4, r4, 14506 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 1003 +; P9LE-NEXT: mulhdu r3, r3, r4 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; -; P9BE-LABEL: fold_urem_vec_1: +; P9BE-LABEL: lower_urem_vec_1: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 65 +; P9BE-NEXT: ori r4, r4, 22280 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 16727 -; P9BE-NEXT: ori r5, r5, 2287 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: lis r5, 21399 -; P9BE-NEXT: ori r5, r5, 33437 -; P9BE-NEXT: rldicl r4, r4, 24, 40 -; P9BE-NEXT: mulli r4, r4, 1003 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: oris r4, r4, 61158 +; P9BE-NEXT: ori r4, r4, 14506 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 1003 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 668 +; P9BE-NEXT: ori r4, r4, 48148 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 58848 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: ori r4, r4, 42800 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: lis r5, 8456 -; P9BE-NEXT: ori r5, r5, 16913 -; P9BE-NEXT: rldicl r4, r4, 27, 37 -; P9BE-NEXT: mulli r4, r4, 98 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 98 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 528 +; P9BE-NEXT: ori r4, r4, 33825 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 2114 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: ori r4, r4, 4229 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9BE-NEXT: mulld r3, 
r3, r5 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: rldicl r3, r3, 30, 34 -; P9BE-NEXT: mulli r3, r3, 124 -; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 124 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: oris r4, r4, 4139 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r4, r4, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 95 +; P9BE-NEXT: mulhdu r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: fold_urem_vec_1: +; P8LE-LABEL: lower_urem_vec_1: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: lis r8, 21399 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: ori r8, r8, 33437 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r9, r4, 32, 48 -; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 -; P8LE-NEXT: rldicl r10, r4, 16, 48 -; P8LE-NEXT: rlwinm r11, r9, 0, 16, 31 -; P8LE-NEXT: clrldi r7, r6, 32 -; P8LE-NEXT: rlwinm r12, r10, 0, 16, 31 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: lis r4, 528 +; P8LE-NEXT: lis r5, 668 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: ori r4, r4, 33825 +; P8LE-NEXT: ori r5, r5, 48148 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: oris r4, r4, 2114 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: ori r4, r4, 4229 +; P8LE-NEXT: clrldi r7, r6, 48 +; P8LE-NEXT: rldicl r8, r6, 48, 48 +; P8LE-NEXT: oris r5, r5, 58848 +; P8LE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8LE-NEXT: ori r5, r5, 42800 ; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: lis r7, 16727 -; P8LE-NEXT: ori r7, r7, 2287 -; P8LE-NEXT: mulld r8, r11, r8 -; P8LE-NEXT: lis r11, 8456 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r7, r12, r7 -; P8LE-NEXT: ori r11, r11, 16913 -; P8LE-NEXT: rlwinm r12, r4, 30, 18, 31 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: mulld r11, r12, r11 -; P8LE-NEXT: subf r6, r3, r6 -; P8LE-NEXT: rldicl r8, r8, 27, 37 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: rldicl r6, r7, 24, 40 -; P8LE-NEXT: mulli r7, r8, 98 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: rldicl r8, r11, 30, 34 -; P8LE-NEXT: mulli r6, r6, 1003 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: mulli r8, r8, 124 -; P8LE-NEXT: subf r7, r7, r9 -; P8LE-NEXT: subf r6, r6, r10 -; P8LE-NEXT: mtvsrd f0, r7 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: subf r4, r8, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: lis r7, 65 +; P8LE-NEXT: ori r7, r7, 22280 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: rldicl r8, r6, 32, 48 +; P8LE-NEXT: sldi r7, r7, 32 +; P8LE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8LE-NEXT: oris r7, r7, 61158 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: mulld r5, r8, r5 +; P8LE-NEXT: ori r7, r7, 14506 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; 
P8LE-NEXT: mulld r6, r6, r7 +; P8LE-NEXT: li r7, 95 +; P8LE-NEXT: mulhdu r3, r3, r7 +; P8LE-NEXT: li r7, 124 +; P8LE-NEXT: mulhdu r4, r4, r7 +; P8LE-NEXT: li r7, 98 +; P8LE-NEXT: mulhdu r5, r5, r7 +; P8LE-NEXT: li r7, 1003 +; P8LE-NEXT: mulhdu r6, r6, r7 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r6 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; -; P8BE-LABEL: fold_urem_vec_1: +; P8BE-LABEL: lower_urem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: lis r9, 16727 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: ori r9, r9, 2287 -; P8BE-NEXT: rldicl r5, r4, 16, 48 -; P8BE-NEXT: clrldi r6, r4, 48 -; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 -; P8BE-NEXT: rldicl r7, r4, 48, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 -; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: lis r3, 65 +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r4, 668 +; P8BE-NEXT: lis r5, 528 +; P8BE-NEXT: ori r3, r3, 22280 +; P8BE-NEXT: ori r4, r4, 48148 +; P8BE-NEXT: ori r5, r5, 33825 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r7, r6, 48 +; P8BE-NEXT: oris r3, r3, 61158 +; P8BE-NEXT: sldi r4, r4, 32 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: mulld r3, r8, r3 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: clrldi r10, r6, 32 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: clrldi r11, r7, 32 -; P8BE-NEXT: mulld r9, r10, r9 -; P8BE-NEXT: lis r10, 8456 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: mulld r8, r11, r8 -; P8BE-NEXT: ori r10, r10, 16913 -; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: mulld r10, r11, r10 -; P8BE-NEXT: subf r11, r3, r5 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: rldicl r9, r9, 24, 40 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: rldicl r8, r8, 27, 37 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r9, r9, 1003 -; P8BE-NEXT: rldicl r10, r10, 30, 34 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: mulli r10, r10, 124 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: subf r7, r8, r7 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: subf r4, r10, r4 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: ori r3, r3, 14506 +; P8BE-NEXT: rldicl r8, r6, 48, 48 +; P8BE-NEXT: oris r4, r4, 58848 +; P8BE-NEXT: mulld r3, r7, r3 +; P8BE-NEXT: lis r7, 689 +; P8BE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8BE-NEXT: ori r4, r4, 42800 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: ori r7, r7, 55878 +; P8BE-NEXT: mulld r4, r8, r4 +; P8BE-NEXT: rldicl r8, r6, 32, 48 +; P8BE-NEXT: oris r5, r5, 2114 +; P8BE-NEXT: sldi r7, r7, 32 +; P8BE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8BE-NEXT: ori r5, r5, 4229 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: oris r7, r7, 4139 +; P8BE-NEXT: mulld r5, r8, r5 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: ori r7, r7, 7589 +; P8BE-NEXT: mulld r6, r6, r7 +; P8BE-NEXT: li r7, 1003 +; P8BE-NEXT: mulhdu r3, r3, r7 +; P8BE-NEXT: li r7, 98 +; P8BE-NEXT: mulhdu r4, r4, r7 +; P8BE-NEXT: li r7, 124 +; P8BE-NEXT: mulhdu r5, r5, r7 +; P8BE-NEXT: li r7, 95 +; P8BE-NEXT: mulhdu r6, r6, r7 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd 
v5, r4 +; P8BE-NEXT: mtvsrd v2, r3 +; P8BE-NEXT: sldi r3, r4, 48 +; P8BE-NEXT: sldi r4, r5, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v4, v5 +; P8BE-NEXT: vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { -; P9LE-LABEL: fold_urem_vec_2: +define <4 x i16> @lower_urem_vec_2(<4 x i16> %x) { +; P9LE-LABEL: lower_urem_vec_2: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 @@ -303,130 +285,78 @@ ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; -; P9BE-LABEL: fold_urem_vec_2: +; P9BE-LABEL: lower_urem_vec_2: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; 
P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 4139 +; P9BE-NEXT: ori r4, r4, 7589 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: fold_urem_vec_2: +; P8LE-LABEL: lower_urem_vec_2: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 22765 -; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 8969 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: clrldi r3, r5, 48 -; P8LE-NEXT: rldicl r6, r5, 48, 48 -; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 -; P8LE-NEXT: rldicl r7, r5, 32, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r5, r5, 16, 48 -; P8LE-NEXT: clrldi r11, r8, 32 -; P8LE-NEXT: rlwinm r10, r7, 0, 16, 31 -; P8LE-NEXT: rlwinm r12, r5, 0, 16, 31 -; P8LE-NEXT: mulld r11, r11, r4 -; P8LE-NEXT: clrldi r0, r9, 32 -; P8LE-NEXT: clrldi r30, r10, 32 -; P8LE-NEXT: clrldi r29, r12, 32 -; P8LE-NEXT: mulld r0, r0, r4 -; P8LE-NEXT: mulld r30, r30, r4 -; P8LE-NEXT: mulld r4, r29, r4 -; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload -; P8LE-NEXT: rldicl r11, r11, 32, 32 -; P8LE-NEXT: subf r8, r11, r8 -; P8LE-NEXT: rldicl r0, r0, 32, 32 -; P8LE-NEXT: srwi r8, r8, 1 -; P8LE-NEXT: rldicl r30, r30, 32, 32 -; P8LE-NEXT: rldicl r4, r4, 32, 32 -; P8LE-NEXT: subf r9, r0, r9 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: subf r10, r30, r10 -; P8LE-NEXT: subf r11, r4, r12 -; P8LE-NEXT: srwi r9, r9, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: srwi r10, r10, 1 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: add r9, r9, r0 -; P8LE-NEXT: add r10, r10, r30 -; P8LE-NEXT: add r4, r11, r4 -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: srwi r4, r4, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: 
mulli r4, r4, 95 -; P8LE-NEXT: subf r3, r8, r3 -; P8LE-NEXT: subf r6, r9, r6 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 -; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8LE-NEXT: mulld r5, r5, r3 +; P8LE-NEXT: mulld r6, r6, r3 +; P8LE-NEXT: mulld r7, r7, r3 +; P8LE-NEXT: mulld r3, r4, r3 +; P8LE-NEXT: li r4, 95 +; P8LE-NEXT: mulhdu r5, r5, r4 +; P8LE-NEXT: mulhdu r6, r6, r4 +; P8LE-NEXT: mulhdu r7, r7, r4 +; P8LE-NEXT: mulhdu r3, r3, r4 +; P8LE-NEXT: mtvsrd f0, r5 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f2, r7 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 @@ -435,61 +365,37 @@ ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; -; P8BE-LABEL: fold_urem_vec_2: +; P8BE-LABEL: lower_urem_vec_2: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: rldicl r6, r4, 48, 48 ; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 ; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 -; P8BE-NEXT: clrldi r8, r5, 32 ; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: mulld r5, r5, r3 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: clrldi r9, r6, 32 -; P8BE-NEXT: mulld r8, r8, r3 ; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: clrldi r10, r7, 32 -; P8BE-NEXT: mulld r9, r9, r3 -; P8BE-NEXT: clrldi r11, r4, 32 -; P8BE-NEXT: mulld r10, r10, r3 -; P8BE-NEXT: mulld r3, r11, r3 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: subf r11, r8, r5 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: subf r12, r9, r6 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: subf r11, r10, r7 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: subf r12, r3, r4 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: subf r5, r8, r5 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: subf r7, r10, r7 -; P8BE-NEXT: subf r3, r3, r4 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: mulld r6, r6, r3 +; P8BE-NEXT: mulld r7, r7, r3 +; P8BE-NEXT: mulld r3, r4, r3 +; P8BE-NEXT: li r4, 95 +; P8BE-NEXT: mulhdu r5, r5, r4 +; P8BE-NEXT: mulhdu r6, r6, r4 +; P8BE-NEXT: mulhdu r7, r7, r4 +; P8BE-NEXT: mulhdu r3, r3, r4 +; P8BE-NEXT: sldi r4, r5, 48 +; P8BE-NEXT: sldi r5, r6, 48 +; P8BE-NEXT: mtvsrd v2, r4 ; P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: 
sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: mtvsrd v4, r4 ; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 @@ -501,7 +407,7 @@ } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; P9LE-LABEL: combine_urem_udiv: ; P9LE: # %bb.0: @@ -817,9 +723,9 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { -; P9LE-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define <4 x i16> @dont_lower_urem_power_of_two(<4 x i16> %x) { +; P9LE-LABEL: dont_lower_urem_power_of_two: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -830,34 +736,31 @@ ; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 ; P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 95 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_urem_power_of_two: +; P9BE-LABEL: dont_lower_urem_power_of_two: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -869,141 +772,137 @@ ; P9BE-NEXT: rlwinm r3, r3, 0, 26, 31 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 ; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 4139 +; P9BE-NEXT: ori r4, r4, 7589 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 95 +; P9BE-NEXT: mulhdu r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v3, v2 ; P9BE-NEXT: blr 
; -; P8LE-LABEL: dont_fold_urem_power_of_two: +; P8LE-LABEL: dont_lower_urem_power_of_two: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r6, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 ; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 ; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 -; P8LE-NEXT: clrldi r7, r6, 32 -; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: rlwinm r7, r7, 0, 27, 31 -; P8LE-NEXT: mtvsrd f1, r7 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: subf r6, r3, r6 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: mulld r3, r5, r3 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rlwinm r5, r5, 0, 26, 31 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: mulhdu r3, r3, r6 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: rlwinm r6, r6, 0, 26, 31 -; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: rlwinm r5, r6, 0, 27, 31 ; P8LE-NEXT: rlwinm r4, r4, 0, 29, 31 -; P8LE-NEXT: mtvsrd f0, r6 -; P8LE-NEXT: mtvsrd f3, r4 -; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f3, r3 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: mtvsrd f2, r3 ; P8LE-NEXT: vmrglh v2, v3, v2 -; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_urem_power_of_two: +; P8BE-LABEL: dont_lower_urem_power_of_two: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: li r6, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 -; P8BE-NEXT: rlwinm r7, r7, 0, 26, 31 -; P8BE-NEXT: clrldi r6, r5, 32 -; P8BE-NEXT: mulld r3, r6, r3 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: subf r6, r3, r5 -; P8BE-NEXT: srwi r6, r6, 1 -; P8BE-NEXT: add r3, r6, r3 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 27, 31 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: mulhdu r3, r3, r6 +; P8BE-NEXT: rldicl r6, r4, 16, 48 ; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 27, 31 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 26, 31 ; P8BE-NEXT: rlwinm r4, r4, 0, 29, 31 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: sldi r5, r6, 48 ; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v4, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x 
i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { -; P9LE-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. +define <4 x i16> @dont_lower_urem_one(<4 x i16> %x) { +; P9LE-LABEL: dont_lower_urem_one: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 2849 +; P9LE-NEXT: ori r4, r4, 25644 ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: li r5, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: oris r6, r5, 45590 -; P9LE-NEXT: oris r5, r5, 51306 -; P9LE-NEXT: ori r6, r6, 17097 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r6 -; P9LE-NEXT: lis r6, 24749 -; P9LE-NEXT: ori r6, r6, 47143 -; P9LE-NEXT: rldicl r4, r4, 28, 36 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: oris r4, r4, 34192 +; P9LE-NEXT: ori r4, r4, 45591 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 23 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 12 +; P9LE-NEXT: ori r4, r4, 5559 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: oris r4, r4, 1244 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r6 -; P9LE-NEXT: rldicl r4, r4, 21, 43 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 48291 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 5423 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 100 +; P9LE-NEXT: ori r4, r4, 13628 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: oris r4, r4, 18438 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 24, 40 -; P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 17236 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 654 +; P9LE-NEXT: mulhdu r3, r3, r4 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 @@ -1013,87 +912,94 @@ ; P9LE-NEXT: vmrglw v2, v3, v2 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_urem_one: +; P9BE-LABEL: dont_lower_urem_one: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 12 +; P9BE-NEXT: ori r4, r4, 5559 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 24749 -; P9BE-NEXT: ori r5, r5, 47143 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: li r5, 0 -; P9BE-NEXT: oris r6, r5, 45590 -; P9BE-NEXT: oris r5, r5, 51306 -; P9BE-NEXT: ori r6, r6, 17097 -; P9BE-NEXT: ori r5, r5, 30865 -; P9BE-NEXT: rldicl r4, r4, 21, 43 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: oris r4, r4, 1244 +; P9BE-NEXT: ori r4, r4, 48291 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 5423 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 2849 +; P9BE-NEXT: ori r4, r4, 25644 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 34192 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: ori r4, r4, 45591 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r6 -; P9BE-NEXT: rldicl r4, r4, 28, 36 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 
23 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 100 +; P9BE-NEXT: ori r4, r4, 13628 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 18438 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9BE-NEXT: mulld r3, r3, r5 -; P9BE-NEXT: rldicl r3, r3, 24, 40 -; P9BE-NEXT: mulli r3, r3, 654 -; P9BE-NEXT: subf r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r4, r4, 17236 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 654 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: dont_fold_urem_one: +; P8LE-LABEL: dont_lower_urem_one: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: li r3, 0 -; P8LE-NEXT: lis r8, 24749 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: lis r7, 100 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: oris r5, r3, 45590 -; P8LE-NEXT: ori r8, r8, 47143 -; P8LE-NEXT: oris r3, r3, 51306 -; P8LE-NEXT: ori r5, r5, 17097 -; P8LE-NEXT: ori r3, r3, 30865 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r7, r4, 16, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r5, r9, r5 -; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31 -; P8LE-NEXT: mulld r8, r9, r8 -; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31 -; P8LE-NEXT: mulld r3, r9, r3 -; P8LE-NEXT: rldicl r5, r5, 28, 36 -; P8LE-NEXT: rldicl r8, r8, 21, 43 -; P8LE-NEXT: mulli r5, r5, 23 -; P8LE-NEXT: rldicl r3, r3, 24, 40 -; P8LE-NEXT: mulli r8, r8, 5423 -; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: subf r5, r5, r6 -; P8LE-NEXT: subf r6, r8, r7 -; P8LE-NEXT: mtvsrd f0, r5 -; P8LE-NEXT: subf r3, r3, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: ori r7, r7, 13628 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: oris r4, r4, 1244 +; P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r6, r5, 32, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: rldicl r6, r5, 16, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: mulld r4, r6, r4 +; P8LE-NEXT: sldi r6, r7, 32 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: oris r6, r6, 18438 +; P8LE-NEXT: ori r6, r6, 17236 +; P8LE-NEXT: mulld r5, r5, r6 +; P8LE-NEXT: li r6, 23 +; P8LE-NEXT: mulhdu r3, r3, r6 +; P8LE-NEXT: li r6, 5423 +; P8LE-NEXT: mulhdu r4, r4, r6 +; P8LE-NEXT: li r6, 654 +; P8LE-NEXT: mulhdu r5, r5, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: mtvsrd f1, r4 ; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: vmrglh v2, v3, v2 @@ -1101,66 +1007,68 @@ ; P8LE-NEXT: vmrglw v2, v2, v3 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_urem_one: +; P8BE-LABEL: dont_lower_urem_one: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 12 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: li r3, 0 -; P8BE-NEXT: lis r8, 24749 -; P8BE-NEXT: oris r6, r3, 51306 -; 
P8BE-NEXT: ori r8, r8, 47143 -; P8BE-NEXT: oris r3, r3, 45590 -; P8BE-NEXT: rldicl r5, r4, 32, 48 -; P8BE-NEXT: clrldi r7, r4, 48 -; P8BE-NEXT: ori r6, r6, 30865 -; P8BE-NEXT: ori r3, r3, 17097 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: rlwinm r9, r5, 31, 17, 31 +; P8BE-NEXT: lis r5, 2849 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r5, r5, 25644 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r6, r4, 48 +; P8BE-NEXT: oris r3, r3, 1244 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: ori r3, r3, 48291 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mulld r3, r6, r3 +; P8BE-NEXT: lis r6, 100 +; P8BE-NEXT: oris r5, r5, 34192 +; P8BE-NEXT: ori r6, r6, 13628 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: ori r5, r5, 45591 +; P8BE-NEXT: sldi r6, r6, 32 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: oris r6, r6, 18438 +; P8BE-NEXT: mulld r5, r7, r5 ; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: mulld r6, r9, r6 -; P8BE-NEXT: clrldi r9, r7, 32 -; P8BE-NEXT: mulld r8, r9, r8 -; P8BE-NEXT: clrldi r9, r4, 32 -; P8BE-NEXT: mulld r3, r9, r3 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: rldicl r6, r6, 24, 40 -; P8BE-NEXT: mulli r6, r6, 654 -; P8BE-NEXT: rldicl r8, r8, 21, 43 -; P8BE-NEXT: rldicl r3, r3, 28, 36 -; P8BE-NEXT: mulli r8, r8, 5423 -; P8BE-NEXT: mulli r3, r3, 23 -; P8BE-NEXT: subf r5, r6, r5 -; P8BE-NEXT: sldi r6, r9, 48 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: subf r6, r8, r7 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: subf r3, r3, r4 -; P8BE-NEXT: sldi r4, r6, 48 +; P8BE-NEXT: ori r6, r6, 17236 +; P8BE-NEXT: mulld r4, r4, r6 +; P8BE-NEXT: li r6, 5423 +; P8BE-NEXT: mulhdu r3, r3, r6 +; P8BE-NEXT: li r6, 23 +; P8BE-NEXT: mulhdu r5, r5, r6 +; P8BE-NEXT: li r6, 654 +; P8BE-NEXT: mulhdu r4, r4, r6 +; P8BE-NEXT: li r6, 0 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r3, r4, 48 +; P8BE-NEXT: mtvsrd v4, r5 ; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v2, v2, v3 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is 2^16. -define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^16. +define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_urem_i16_smax: ; CHECK: # %bb.0: ; CHECK-NEXT: blr %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold i64 urem. -define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { -; P9LE-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem. 
+define <4 x i64> @dont_lower_urem_i64(<4 x i64> %x) { +; P9LE-LABEL: dont_lower_urem_i64: ; P9LE: # %bb.0: ; P9LE-NEXT: lis r4, 25644 ; P9LE-NEXT: ori r4, r4, 34192 @@ -1201,7 +1109,7 @@ ; P9LE-NEXT: mtvsrdd v2, r3, r4 ; P9LE-NEXT: blr ; -; P9BE-LABEL: dont_fold_urem_i64: +; P9BE-LABEL: dont_lower_urem_i64: ; P9BE: # %bb.0: ; P9BE-NEXT: lis r4, 25644 ; P9BE-NEXT: ori r4, r4, 34192 @@ -1241,7 +1149,7 @@ ; P9BE-NEXT: mtvsrdd v2, 0, r3 ; P9BE-NEXT: blr ; -; P8LE-LABEL: dont_fold_urem_i64: +; P8LE-LABEL: dont_lower_urem_i64: ; P8LE: # %bb.0: ; P8LE-NEXT: lis r3, 25644 ; P8LE-NEXT: xxswapd vs0, v3 @@ -1287,7 +1195,7 @@ ; P8LE-NEXT: xxmrghd v2, vs1, vs3 ; P8LE-NEXT: blr ; -; P8BE-LABEL: dont_fold_urem_i64: +; P8BE-LABEL: dont_lower_urem_i64: ; P8BE: # %bb.0: ; P8BE-NEXT: lis r3, 25644 ; P8BE-NEXT: lis r4, -16037 Index: llvm/test/CodeGen/RISCV/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/urem-lkk.ll +++ llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -8,8 +8,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s -define i32 @fold_urem_positive_odd(i32 %x) { -; RV32I-LABEL: fold_urem_positive_odd: +define i32 @lower_urem_positive_odd(i32 %x) { +; RV32I-LABEL: lower_urem_positive_odd: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -23,7 +23,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_urem_positive_odd: +; RV32IM-LABEL: lower_urem_positive_odd: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lui a1, 364242 ; RV32IM-NEXT: addi a1, a1, 777 @@ -38,7 +38,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_urem_positive_odd: +; RV64I-LABEL: lower_urem_positive_odd: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -54,26 +54,19 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_urem_positive_odd: +; RV64IM-LABEL: lower_urem_positive_odd: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 32 ; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1423 -; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: lui a1, 176602 +; RV64IM-NEXT: addiw a1, a1, 1121 ; RV64IM-NEXT: slli a1, a1, 15 -; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: addi a1, a1, 345 ; RV64IM-NEXT: slli a1, a1, 13 -; RV64IM-NEXT: addi a1, a1, -1811 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 561 -; RV64IM-NEXT: mulhu a1, a0, a1 -; RV64IM-NEXT: sub a2, a0, a1 -; RV64IM-NEXT: srli a2, a2, 1 -; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: addi a2, zero, 95 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: addi a1, a1, -603 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: addi a1, zero, 95 +; RV64IM-NEXT: mulhu a0, a0, a1 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 @@ -81,8 +74,8 @@ } -define i32 @fold_urem_positive_even(i32 %x) { -; RV32I-LABEL: fold_urem_positive_even: +define i32 @lower_urem_positive_even(i32 %x) { +; RV32I-LABEL: lower_urem_positive_even: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -96,7 +89,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_urem_positive_even: +; RV32IM-LABEL: lower_urem_positive_even: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lui a1, 1012964 ; RV32IM-NEXT: addi a1, a1, -61 @@ -108,7 +101,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; 
RV64I-LABEL: fold_urem_positive_even: +; RV64I-LABEL: lower_urem_positive_even: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -124,23 +117,19 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_urem_positive_even: +; RV64IM-LABEL: lower_urem_positive_even: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 32 ; RV64IM-NEXT: srli a0, a0, 32 -; RV64IM-NEXT: lui a1, 1048020 +; RV64IM-NEXT: lui a1, 15828 ; RV64IM-NEXT: addiw a1, a1, -1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, 139 -; RV64IM-NEXT: slli a1, a1, 14 -; RV64IM-NEXT: addi a1, a1, 1793 -; RV64IM-NEXT: slli a1, a1, 12 -; RV64IM-NEXT: addi a1, a1, -139 -; RV64IM-NEXT: mulhu a1, a0, a1 -; RV64IM-NEXT: srli a1, a1, 10 -; RV64IM-NEXT: addi a2, zero, 1060 -; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1113 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -1020 +; RV64IM-NEXT: mul a0, a0, a1 +; RV64IM-NEXT: addi a1, zero, 1060 +; RV64IM-NEXT: mulhu a0, a0, a1 ; RV64IM-NEXT: .cfi_def_cfa_offset 0 ; RV64IM-NEXT: ret %1 = urem i32 %x, 1060 @@ -148,7 +137,7 @@ } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define i32 @combine_urem_udiv(i32 %x) { ; RV32I-LABEL: combine_urem_udiv: ; RV32I: # %bb.0: @@ -253,9 +242,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. -define i32 @dont_fold_urem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_urem_power_of_two: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 63 ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -264,9 +253,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_urem_one(i32 %x) { -; CHECK-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_urem_one(i32 %x) { +; CHECK-LABEL: dont_lower_urem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: mv a0, zero ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -275,9 +264,9 @@ ret i32 %1 } -; Don't fold if the divisor is 2^32. -define i32 @dont_fold_urem_i32_umax(i32 %x) { -; CHECK-LABEL: dont_fold_urem_i32_umax: +; Don't lower if the divisor is 2^32. 
+define i32 @dont_lower_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_lower_urem_i32_umax: ; CHECK: # %bb.0: ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret @@ -285,9 +274,9 @@ ret i32 %1 } -; Don't fold i64 urem -define i64 @dont_fold_urem_i64(i64 %x) { -; RV32I-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem +define i64 @dont_lower_urem_i64(i64 %x) { +; RV32I-LABEL: dont_lower_urem_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: .cfi_def_cfa_offset 16 @@ -302,7 +291,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM-LABEL: dont_lower_urem_i64: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -16 ; RV32IM-NEXT: .cfi_def_cfa_offset 16 @@ -317,7 +306,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_i64: +; RV64I-LABEL: dont_lower_urem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 @@ -331,7 +320,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_urem_i64: +; RV64IM-LABEL: dont_lower_urem_i64: ; RV64IM: # %bb.0: ; RV64IM-NEXT: srli a1, a0, 1 ; RV64IM-NEXT: lui a2, 2675 @@ -352,3 +341,218 @@ %1 = urem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 + +define void @urem_loop(i32 %x) { +; RV32I-LABEL: urem_loop: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) +; RV32I-NEXT: sw s2, 0(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s1, zero +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: lui a1, %hi(.L.str) +; RV32I-NEXT: addi s0, a1, %lo(.L.str) +; RV32I-NEXT: .LBB7_1: # %loop +; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: add s1, a0, s1 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call printf +; RV32I-NEXT: bltu a0, s2, .LBB7_1 +; RV32I-NEXT: # %bb.2: # %afterloop +; RV32I-NEXT: lw s2, 0(sp) +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: urem_loop: +; RV32IM: # %bb.0: # %entry +; RV32IM-NEXT: addi sp, sp, -32 +; RV32IM-NEXT: .cfi_def_cfa_offset 32 +; RV32IM-NEXT: sw ra, 28(sp) +; RV32IM-NEXT: sw s0, 24(sp) +; RV32IM-NEXT: sw s1, 20(sp) +; RV32IM-NEXT: sw s2, 16(sp) +; RV32IM-NEXT: sw s3, 12(sp) +; RV32IM-NEXT: sw s4, 8(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: .cfi_offset s0, -8 +; RV32IM-NEXT: .cfi_offset s1, -12 +; RV32IM-NEXT: .cfi_offset s2, -16 +; RV32IM-NEXT: .cfi_offset s3, -20 +; RV32IM-NEXT: .cfi_offset s4, -24 +; RV32IM-NEXT: mv s2, a0 +; RV32IM-NEXT: mv s1, zero +; RV32IM-NEXT: addi a0, zero, 1 +; RV32IM-NEXT: lui a1, 364242 +; RV32IM-NEXT: addi s3, a1, 777 +; RV32IM-NEXT: addi s4, zero, 95 +; RV32IM-NEXT: lui a1, %hi(.L.str) +; RV32IM-NEXT: addi s0, a1, %lo(.L.str) +; RV32IM-NEXT: .LBB7_1: # %loop +; RV32IM-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IM-NEXT: mulhu a1, a0, s3 +; RV32IM-NEXT: sub a2, 
a0, a1 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: mul a1, a1, s4 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: add s1, a0, s1 +; RV32IM-NEXT: mv a0, s0 +; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: call printf +; RV32IM-NEXT: bltu a0, s2, .LBB7_1 +; RV32IM-NEXT: # %bb.2: # %afterloop +; RV32IM-NEXT: lw s4, 8(sp) +; RV32IM-NEXT: lw s3, 12(sp) +; RV32IM-NEXT: lw s2, 16(sp) +; RV32IM-NEXT: lw s1, 20(sp) +; RV32IM-NEXT: lw s0, 24(sp) +; RV32IM-NEXT: lw ra, 28(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: .cfi_restore s0 +; RV32IM-NEXT: .cfi_restore s1 +; RV32IM-NEXT: .cfi_restore s2 +; RV32IM-NEXT: .cfi_restore s3 +; RV32IM-NEXT: .cfi_restore s4 +; RV32IM-NEXT: addi sp, sp, 32 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: urem_loop: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: sd s2, 0(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv s0, zero +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: lui a2, %hi(.L.str) +; RV64I-NEXT: addi s2, a2, %lo(.L.str) +; RV64I-NEXT: sext.w s1, a1 +; RV64I-NEXT: .LBB7_1: # %loop +; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: addw s0, a0, s0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call printf +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: bltu a1, s1, .LBB7_1 +; RV64I-NEXT: # %bb.2: # %afterloop +; RV64I-NEXT: ld s2, 0(sp) +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: urem_loop: +; RV64IM: # %bb.0: # %entry +; RV64IM-NEXT: addi sp, sp, -48 +; RV64IM-NEXT: .cfi_def_cfa_offset 48 +; RV64IM-NEXT: sd ra, 40(sp) +; RV64IM-NEXT: sd s0, 32(sp) +; RV64IM-NEXT: sd s1, 24(sp) +; RV64IM-NEXT: sd s2, 16(sp) +; RV64IM-NEXT: sd s3, 8(sp) +; RV64IM-NEXT: sd s4, 0(sp) +; RV64IM-NEXT: .cfi_offset ra, -8 +; RV64IM-NEXT: .cfi_offset s0, -16 +; RV64IM-NEXT: .cfi_offset s1, -24 +; RV64IM-NEXT: .cfi_offset s2, -32 +; RV64IM-NEXT: .cfi_offset s3, -40 +; RV64IM-NEXT: .cfi_offset s4, -48 +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: mv s0, zero +; RV64IM-NEXT: addi a0, zero, 1 +; RV64IM-NEXT: lui a2, 176602 +; RV64IM-NEXT: addiw a2, a2, 1121 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, 345 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi s2, a2, -603 +; RV64IM-NEXT: addi s3, zero, 95 +; RV64IM-NEXT: lui a2, %hi(.L.str) +; RV64IM-NEXT: addi s4, a2, %lo(.L.str) +; RV64IM-NEXT: sext.w s1, a1 +; RV64IM-NEXT: .LBB7_1: # %loop +; RV64IM-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: mul a0, a0, s2 +; RV64IM-NEXT: mulhu a0, a0, s3 +; RV64IM-NEXT: addw s0, a0, s0 +; RV64IM-NEXT: mv a0, s4 +; RV64IM-NEXT: mv a1, s0 +; RV64IM-NEXT: call printf +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: bltu a1, s1, .LBB7_1 +; RV64IM-NEXT: # %bb.2: # %afterloop +; RV64IM-NEXT: ld s4, 0(sp) +; RV64IM-NEXT: ld s3, 
8(sp) +; RV64IM-NEXT: ld s2, 16(sp) +; RV64IM-NEXT: ld s1, 24(sp) +; RV64IM-NEXT: ld s0, 32(sp) +; RV64IM-NEXT: ld ra, 40(sp) +; RV64IM-NEXT: .cfi_restore ra +; RV64IM-NEXT: .cfi_restore s0 +; RV64IM-NEXT: .cfi_restore s1 +; RV64IM-NEXT: .cfi_restore s2 +; RV64IM-NEXT: .cfi_restore s3 +; RV64IM-NEXT: .cfi_restore s4 +; RV64IM-NEXT: addi sp, sp, 48 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = urem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/RISCV/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -9,8 +9,8 @@ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s -define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { -; RV32I-LABEL: fold_urem_vec_1: +define <4 x i16> @lower_urem_vec_1(<4 x i16> %x) { +; RV32I-LABEL: lower_urem_vec_1: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -70,7 +70,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_urem_vec_1: +; RV32IM-LABEL: lower_urem_vec_1: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 12(a1) ; RV32IM-NEXT: lhu a3, 8(a1) @@ -115,7 +115,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_urem_vec_1: +; RV64I-LABEL: lower_urem_vec_1: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: .cfi_def_cfa_offset 64 @@ -175,7 +175,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_urem_vec_1: +; RV64IM-LABEL: lower_urem_vec_1: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 24(a1) ; RV64IM-NEXT: lhu a3, 16(a1) @@ -248,8 +248,8 @@ ret <4 x i16> %1 } -define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { -; RV32I-LABEL: fold_urem_vec_2: +define <4 x i16> @lower_urem_vec_2(<4 x i16> %x) { +; RV32I-LABEL: lower_urem_vec_2: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -309,7 +309,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: fold_urem_vec_2: +; RV32IM-LABEL: lower_urem_vec_2: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 12(a1) ; RV32IM-NEXT: lhu a7, 8(a1) @@ -353,7 +353,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: fold_urem_vec_2: +; RV64I-LABEL: lower_urem_vec_2: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: .cfi_def_cfa_offset 64 @@ -413,7 +413,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: fold_urem_vec_2: +; RV64IM-LABEL: lower_urem_vec_2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 24(a1) ; RV64IM-NEXT: lhu a7, 16(a1) @@ -467,7 +467,7 @@ } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; RV32I-LABEL: combine_urem_udiv: ; RV32I: # %bb.0: @@ -768,9 +768,9 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { -; RV32I-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. 
+define <4 x i16> @dont_lower_urem_power_of_two(<4 x i16> %x) { +; RV32I-LABEL: dont_lower_urem_power_of_two: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -813,7 +813,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_power_of_two: +; RV32IM-LABEL: dont_lower_urem_power_of_two: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a6, 8(a1) ; RV32IM-NEXT: lhu a3, 4(a1) @@ -839,7 +839,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_power_of_two: +; RV64I-LABEL: dont_lower_urem_power_of_two: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -882,7 +882,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_urem_power_of_two: +; RV64IM-LABEL: dont_lower_urem_power_of_two: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a6, 16(a1) ; RV64IM-NEXT: lhu a3, 8(a1) @@ -917,9 +917,9 @@ ret <4 x i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { -; RV32I-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. +define <4 x i16> @dont_lower_urem_one(<4 x i16> %x) { +; RV32I-LABEL: dont_lower_urem_one: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: .cfi_def_cfa_offset 32 @@ -967,7 +967,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_one: +; RV32IM-LABEL: dont_lower_urem_one: ; RV32IM: # %bb.0: ; RV32IM-NEXT: lhu a2, 4(a1) ; RV32IM-NEXT: lhu a3, 12(a1) @@ -1002,7 +1002,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_one: +; RV64I-LABEL: dont_lower_urem_one: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -1050,7 +1050,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_urem_one: +; RV64IM-LABEL: dont_lower_urem_one: ; RV64IM: # %bb.0: ; RV64IM-NEXT: lhu a2, 24(a1) ; RV64IM-NEXT: lhu a3, 8(a1) @@ -1109,9 +1109,9 @@ ret <4 x i16> %1 } -; Don't fold if the divisor is 2^16. -define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^16. +define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_urem_i16_smax: ; CHECK: # %bb.0: ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret @@ -1119,9 +1119,9 @@ ret <4 x i16> %1 } -; Don't fold i64 urem. -define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { -; RV32I-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem. 
+define <4 x i64> @dont_lower_urem_i64(<4 x i64> %x) { +; RV32I-LABEL: dont_lower_urem_i64: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: .cfi_def_cfa_offset 48 @@ -1216,7 +1216,7 @@ ; RV32I-NEXT: .cfi_def_cfa_offset 0 ; RV32I-NEXT: ret ; -; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM-LABEL: dont_lower_urem_i64: ; RV32IM: # %bb.0: ; RV32IM-NEXT: addi sp, sp, -48 ; RV32IM-NEXT: .cfi_def_cfa_offset 48 @@ -1311,7 +1311,7 @@ ; RV32IM-NEXT: .cfi_def_cfa_offset 0 ; RV32IM-NEXT: ret ; -; RV64I-LABEL: dont_fold_urem_i64: +; RV64I-LABEL: dont_lower_urem_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -48 ; RV64I-NEXT: .cfi_def_cfa_offset 48 @@ -1359,7 +1359,7 @@ ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret ; -; RV64IM-LABEL: dont_fold_urem_i64: +; RV64IM-LABEL: dont_lower_urem_i64: ; RV64IM: # %bb.0: ; RV64IM-NEXT: ld a2, 24(a1) ; RV64IM-NEXT: ld a3, 8(a1) Index: llvm/test/CodeGen/X86/load-scalar-as-vector.ll =================================================================== --- llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -520,29 +520,21 @@ define <16 x i8> @urem_op1_constant(i8* %p) nounwind { ; SSE-LABEL: urem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movb (%rdi), %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb %cl -; SSE-NEXT: movzbl %cl, %ecx -; SSE-NEXT: imull $49, %ecx, %ecx -; SSE-NEXT: shrl $10, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subb %cl, %al -; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; SSE-NEXT: movzwl %ax, %eax +; SSE-NEXT: imull $42, %eax, %eax +; SSE-NEXT: shrl $16, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: urem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movb (%rdi), %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrb %cl -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: imull $49, %ecx, %ecx -; AVX-NEXT: shrl $10, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: movzbl (%rdi), %eax +; AVX-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: imull $42, %eax, %eax +; AVX-NEXT: shrl $16, %eax ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %x = load i8, i8* %p Index: llvm/test/CodeGen/X86/urem-i8-constant.ll =================================================================== --- llvm/test/CodeGen/X86/urem-i8-constant.ll +++ llvm/test/CodeGen/X86/urem-i8-constant.ll @@ -7,11 +7,11 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: imull $111, %eax, %ecx -; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: leal (%ecx,%ecx,8), %edx -; CHECK-NEXT: leal (%ecx,%edx,4), %ecx -; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: imull $1772, %eax, %eax # imm = 0x6EC +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: leal (%eax,%eax,8), %ecx +; CHECK-NEXT: leal (%eax,%ecx,4), %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl %t546 = urem i8 %tmp325, 37 Index: llvm/test/CodeGen/X86/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-lkk.ll +++ llvm/test/CodeGen/X86/urem-lkk.ll @@ -1,43 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK -define i32 @fold_urem_positive_odd(i32 %x) { -; CHECK-LABEL: fold_urem_positive_odd: +define i32 
@lower_urem_positive_odd(i32 %x) { +; CHECK-LABEL: lower_urem_positive_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309 -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: subl %ecx, %edx -; CHECK-NEXT: shrl %edx -; CHECK-NEXT: addl %ecx, %edx -; CHECK-NEXT: shrl $6, %edx -; CHECK-NEXT: imull $95, %edx, %ecx -; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %ecx +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %1 = urem i32 %x, 95 ret i32 %1 } -define i32 @fold_urem_positive_even(i32 %x) { -; CHECK-LABEL: fold_urem_positive_even: +define i32 @lower_urem_positive_even(i32 %x) { +; CHECK-LABEL: lower_urem_positive_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3 -; CHECK-NEXT: imulq %rcx, %rdx -; CHECK-NEXT: shrq $42, %rdx -; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424 -; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: movabsq $17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %ecx # imm = 0x424 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %1 = urem i32 %x, 1060 ret i32 %1 } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define i32 @combine_urem_udiv(i32 %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: # %bb.0: @@ -60,9 +56,9 @@ ret i32 %3 } -; Don't fold for divisors that are a power of two. -define i32 @dont_fold_urem_power_of_two(i32 %x) { -; CHECK-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define i32 @dont_lower_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_lower_urem_power_of_two: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: andl $63, %eax @@ -71,9 +67,9 @@ ret i32 %1 } -; Don't fold if the divisor is one. -define i32 @dont_fold_urem_one(i32 %x) { -; CHECK-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. +define i32 @dont_lower_urem_one(i32 %x) { +; CHECK-LABEL: dont_lower_urem_one: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq @@ -81,18 +77,18 @@ ret i32 %1 } -; Don't fold if the divisor is 2^32. -define i32 @dont_fold_urem_i32_umax(i32 %x) { -; CHECK-LABEL: dont_fold_urem_i32_umax: +; Don't lower if the divisor is 2^32. +define i32 @dont_lower_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_lower_urem_i32_umax: ; CHECK: # %bb.0: ; CHECK-NEXT: retq %1 = urem i32 %x, 4294967296 ret i32 %1 } -; Don't fold i64 urem -define i64 @dont_fold_urem_i64(i64 %x) { -; CHECK-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem +define i64 @dont_lower_urem_i64(i64 %x) { +; CHECK-LABEL: dont_lower_urem_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq %rax @@ -106,3 +102,24 @@ %1 = urem i64 %x, 98 ret i64 %1 } + +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +declare dso_local i32 @printf(i8* nocapture readonly, ...) 
local_unnamed_addr #1 + +define void @urem_loop(i32 %x) { +entry: + %0 = add i32 0, 0 + br label %loop +loop: + %1 = phi i32 [ 1, %entry ], [ %5, %loop ] + %2 = phi i32 [%0, %entry], [%4, %loop] + %3 = urem i32 %1, 95 + %4 = add i32 %3, %2 + %5 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4) + %6 = icmp ult i32 %5, %x + br i1 %6, label %loop, label %afterloop + +afterloop: + ret void +} Index: llvm/test/CodeGen/X86/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -3,112 +3,98 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 -define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { -; SSE-LABEL: fold_urem_vec_1: +define <4 x i16> @lower_urem_vec_1(<4 x i16> %x) { +; SSE-LABEL: lower_urem_vec_1: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl $2, %ecx -; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; SSE-NEXT: shrl $19, %ecx -; SSE-NEXT: imull $124, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movzwl %cx, %edx -; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %edx -; SSE-NEXT: imull $95, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl %ecx -; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; SSE-NEXT: shrl $17, %ecx -; SSE-NEXT: imull $98, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $9, %edx -; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [95,124,98,1003] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_1: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; AVX-NEXT: shrl $19, %ecx -; AVX-NEXT: imull $124, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movzwl %cx, %edx -; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %edx -; AVX-NEXT: imull $95, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; 
AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX-NEXT: shrl $17, %ecx -; AVX-NEXT: imull $98, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $9, %edx -; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: lower_urem_vec_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: lower_urem_vec_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [95,124,98,1003] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { -; SSE-LABEL: fold_urem_vec_2: +define <4 x i16> @lower_urem_vec_2(<4 x i16> %x) { +; SSE-LABEL: lower_urem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: lower_urem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; 
AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: lower_urem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [95,95,95,95] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if we can combine urem with udiv. +; Don't lower if we can combine urem with udiv. define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; SSE-LABEL: combine_urem_udiv: ; SSE: # %bb.0: @@ -135,130 +121,112 @@ ret <4 x i16> %3 } -; Don't fold for divisors that are a power of two. -define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { -; SSE-LABEL: dont_fold_urem_power_of_two: +; Don't lower for divisors that are a power of two. +define <4 x i16> @dont_lower_urem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_lower_urem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %ecx -; SSE-NEXT: imull $95, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: andl $31, %ecx -; SSE-NEXT: movd %xmm0, %edx -; SSE-NEXT: andl $63, %edx -; SSE-NEXT: movd %edx, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: andl $7, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,32,8,95] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE-NEXT: retq ; -; AVX-LABEL: dont_fold_urem_power_of_two: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %ecx -; AVX-NEXT: imull $95, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: andl $31, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: andl $63, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: dont_lower_urem_power_of_two: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,32,8,95] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_lower_urem_power_of_two: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [64,32,8,95] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is one. -define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { -; SSE-LABEL: dont_fold_urem_one: +; Don't lower if the divisor is one. +define <4 x i16> @dont_lower_urem_one(<4 x i16> %x) { +; SSE-LABEL: dont_lower_urem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $4, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; SSE-NEXT: shrl $26, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,654,23,5423] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE-NEXT: retq ; -; AVX-LABEL: dont_fold_urem_one: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $4, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 
0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX-NEXT: shrl $26, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: dont_lower_urem_one: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,654,23,5423] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_lower_urem_one: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,654,23,5423] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold if the divisor is 2^16. -define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; CHECK-LABEL: dont_fold_urem_i16_smax: +; Don't lower if the divisor is 2^16. +define <4 x i16> @dont_lower_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_lower_urem_i16_smax: ; CHECK: # %bb.0: ; CHECK-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } -; Don't fold i64 urem. -define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { -; SSE-LABEL: dont_fold_urem_i64: +; Don't lower i64 urem. 
+define <4 x i64> @dont_lower_urem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_lower_urem_i64: ; SSE: # %bb.0: ; SSE-NEXT: movq %xmm1, %rcx ; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 @@ -296,7 +264,7 @@ ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: dont_fold_urem_i64: +; AVX1-LABEL: dont_lower_urem_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rcx @@ -335,7 +303,7 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: dont_fold_urem_i64: +; AVX2-LABEL: dont_lower_urem_i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rcx @@ -375,4 +343,4 @@ ; AVX2-NEXT: retq %1 = urem <4 x i64> %x, ret <4 x i64> %1 -} \ No newline at end of file +} Index: llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -616,16 +616,63 @@ ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2NOBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = urem <8 x i16> %a, ret <8 x i16> %res } @@ -690,61 +737,37 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; 
AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = urem <16 x i8> %a, @@ -852,93 +875,36 @@ ; AVX1-LABEL: test_remconstant_16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; 
AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_remconstant_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] -; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = urem <16 x i8> %a, Index: llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -561,16 +561,29 @@ ; AVX1-NEXT: vinsertf128 
$1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm0 +; AVX512BW-NEXT: retq %res = urem <16 x i16> %a, ret <16 x i16> %res } @@ -646,20 +659,10 @@ ; ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq %res = urem <32 x i8> %a, ret <32 x i8> %res 
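The constants in the updated CHECK lines are the LKK approximate reciprocals, c = floor((2^(2N) - 1) / d) + 1, held in a type twice as wide as the element (hence the vpmovzxbw/vpmovzxwd widening above). A minimal standalone C++ sketch of the identity for 16-bit elements — the helper name lkk_urem16 and the driver are illustrative only, not part of the patch — reproduces the constants seen in these tests: 45210183 for divisor 95 in lower_urem_vec_2, 9363 for the i8 rem-by-7 tests, and 613566757 for the i16 rem-by-7 tests.

#include <cassert>
#include <cstdint>
#include <cstdio>

// n mod d for a 16-bit dividend using only 32-bit multiplies and a shift:
//   c       = floor((2^32 - 1) / d) + 1   (d >= 2; d == 1 is not lowered above either)
//   lowbits = low 32 bits of c * n
//   n mod d = (lowbits * d) >> 32
static uint16_t lkk_urem16(uint16_t n, uint16_t d) {
  assert(d >= 2);
  uint32_t c = UINT32_MAX / d + 1;
  uint32_t lowbits = c * static_cast<uint32_t>(n);
  return static_cast<uint16_t>((static_cast<uint64_t>(lowbits) * d) >> 32);
}

int main() {
  std::printf("%u\n", UINT32_MAX / 95u + 1); // 45210183, as in lower_urem_vec_2
  std::printf("%u\n", 65535u / 7u + 1);      // 9363, as in test_rem7_16i8
  std::printf("%u\n", UINT32_MAX / 7u + 1);  // 613566757, as in test_rem7_8i16
  for (uint32_t n = 0; n <= UINT16_MAX; ++n) // exhaustive check for d = 95
    assert(lkk_urem16(static_cast<uint16_t>(n), 95) == n % 95);
  return 0;
}

The same identity for 64-bit elements would need 128-bit multiplies, which is presumably why the <4 x i64> test above keeps the plain expansion.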
@@ -791,23 +794,10 @@ ; ; AVX512BW-LABEL: test_remconstant_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq %res = urem <32 x i8> %a, ret <32 x i8> %res Index: llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -440,22 +440,23 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $2, 
%ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm4 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ;